[clang] [Clang][AArch64] Add customisable immediate range checking to NEON (PR #100278)

via cfe-commits cfe-commits at lists.llvm.org
Wed Jul 24 04:33:56 PDT 2024


https://github.com/SpencerAbson updated https://github.com/llvm/llvm-project/pull/100278

From 5f4790180ced9cf3b66589106017d301772fb393 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Tue, 23 Jul 2024 08:38:32 +0000
Subject: [PATCH 1/3] Rebase to resolve arm_neon.td conflict

---
 clang/include/clang/Basic/TargetBuiltins.h    |  39 +-
 clang/include/clang/Basic/arm_fp16.td         |   2 +-
 .../include/clang/Basic/arm_immcheck_incl.td  |  39 ++
 clang/include/clang/Basic/arm_neon.td         | 374 ++++++++++++------
 clang/include/clang/Basic/arm_neon_incl.td    |  11 +-
 clang/include/clang/Basic/arm_sve_sme_incl.td |  36 +-
 clang/include/clang/Sema/SemaARM.h            |   3 +
 clang/lib/Sema/SemaARM.cpp                    | 151 ++++---
 clang/test/CodeGen/aarch64-neon-vcmla.c       |  60 ++-
 clang/test/Sema/aarch64-neon-vcmla-ranges.c   | 202 ++++++++++
 clang/utils/TableGen/NeonEmitter.cpp          | 133 +++----
 clang/utils/TableGen/SveEmitter.cpp           |   2 +-
 12 files changed, 700 insertions(+), 352 deletions(-)
 create mode 100644 clang/include/clang/Basic/arm_immcheck_incl.td
 create mode 100644 clang/test/Sema/aarch64-neon-vcmla-ranges.c
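[Editor's note] The patch replaces NEON's single hard-coded immediate-range
helper (the old RFT function in SemaARM.cpp) with per-intrinsic ImmCheck
annotations in the .td files, shared with the SVE/SME flow. As a rough sketch
of the user-visible behaviour (the diagnostic wording in the comments is an
assumption, not taken from this patch):

    #include <arm_neon.h>

    int8x8_t shift_ok(int8x8_t v) {
      return vshr_n_s8(v, 8); /* ImmCheckShiftRight: 1..8 for i8 elements */
    }

    int8x8_t shift_bad(int8x8_t v) {
      /* expected to be rejected, e.g. "argument value 9 is outside the
         valid range [1, 8]" */
      return vshr_n_s8(v, 9);
    }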

diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
index 4333830bf34f2..50e17ad7e1628 100644
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -209,15 +209,45 @@ namespace clang {
         Flags |= QuadFlag;
     }
 
    EltType getEltType() const { return (EltType)(Flags & EltTypeMask); }
     bool isPoly() const {
       EltType ET = getEltType();
       return ET == Poly8 || ET == Poly16 || ET == Poly64;
     }
     bool isUnsigned() const { return (Flags & UnsignedFlag) != 0; }
    bool isQuad() const { return (Flags & QuadFlag) != 0; }
+    unsigned getEltSizeInBits() const {
+      switch (getEltType()) {
+      case Int8:
+      case Poly8:
+        return 8;
+      case Int16:
+      case Float16:
+      case Poly16:
+      case BFloat16:
+        return 16;
+      case Int32:
+      case Float32:
+        return 32;
+      case Int64:
+      case Float64:
+      case Poly64:
+        return 64;
+      case Poly128:
+        return 128;
+      default:
+        llvm_unreachable("Invalid NeonTypeFlag!");
+      }
+    }
   };
 
+  // Shared between SVE/SME and NEON.
+  enum ArmImmCheckType {
+#define LLVM_GET_ARM_INTRIN_IMMCHECKTYPES
+#include "clang/Basic/arm_sve_typeflags.inc"
+#undef LLVM_GET_ARM_INTRIN_IMMCHECKTYPES
+  };
+
   /// Flags to identify the types for overloaded SVE builtins.
   class SVETypeFlags {
     uint64_t Flags;
@@ -249,11 +279,6 @@ namespace clang {
 #undef LLVM_GET_SVE_MERGETYPES
     };
 
-    enum ImmCheckType {
-#define LLVM_GET_SVE_IMMCHECKTYPES
-#include "clang/Basic/arm_sve_typeflags.inc"
-#undef LLVM_GET_SVE_IMMCHECKTYPES
-    };
 
     SVETypeFlags(uint64_t F) : Flags(F) {
       EltTypeShift = llvm::countr_zero(EltTypeMask);
diff --git a/clang/include/clang/Basic/arm_fp16.td b/clang/include/clang/Basic/arm_fp16.td
index d36b4617bef5d..42228a3ba1ffa 100644
--- a/clang/include/clang/Basic/arm_fp16.td
+++ b/clang/include/clang/Basic/arm_fp16.td
@@ -76,7 +76,7 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
   def SCALAR_FCVTPUH  : SInst<"vcvtp_u16", "(1U)1", "Sh">;
   def SCALAR_FCVTPUH1 : SInst<"vcvtp_u32", "(1U>)1", "Sh">;
   def SCALAR_FCVTPUH2 : SInst<"vcvtp_u64", "(1U>>)1", "Sh">;
-  let isVCVT_N = 1 in {
+  let isVCVT_N = 1, ImmChecks = [ImmCheck<1, ImmCheck1_16>] in {
     def SCALAR_SCVTFSHO : SInst<"vcvth_n_f16", "(1F)(1!)I", "sUs">;
     def SCALAR_SCVTFSH1O: SInst<"vcvth_n_f16", "(1F<)(1!)I", "iUi">;
     def SCALAR_SCVTFSH2O: SInst<"vcvth_n_f16", "(1F<<)(1!)I", "lUl">;
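[Editor's note] The ImmCheck1_16 annotation above covers the scalar fp16
fixed-point converts, whose "number of fraction bits" operand must lie in
1..16. A minimal sketch, assuming a target with +fullfp16:

    #include <arm_fp16.h>

    float16_t cvt(int16_t x) {
      return vcvth_n_f16_s16(x, 16); /* ok: n must satisfy 1 <= n <= 16 */
    }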
diff --git a/clang/include/clang/Basic/arm_immcheck_incl.td b/clang/include/clang/Basic/arm_immcheck_incl.td
new file mode 100644
index 0000000000000..3b20248f65040
--- /dev/null
+++ b/clang/include/clang/Basic/arm_immcheck_incl.td
@@ -0,0 +1,39 @@
+class ImmCheckType<int val> {
+  int Value = val;
+}
+
+// These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h
+def ImmCheck0_31                : ImmCheckType<0>;  // 0..31 (used for e.g. predicate patterns)
+def ImmCheck1_16                : ImmCheckType<1>;  // 1..16
+def ImmCheckExtract             : ImmCheckType<2>;  // 0..(2048/sizeinbits(elt) - 1)
+def ImmCheckShiftRight          : ImmCheckType<3>;  // 1..sizeinbits(elt)
+def ImmCheckShiftRightNarrow    : ImmCheckType<4>;  // 1..sizeinbits(elt)/2
+def ImmCheckShiftLeft           : ImmCheckType<5>;  // 0..(sizeinbits(elt) - 1)
+def ImmCheck0_7                 : ImmCheckType<6>;  // 0..7
+def ImmCheckLaneIndex           : ImmCheckType<7>;  // 0..(sizeinbits(vec)/sizeinbits(elt) - 1)
+def ImmCheckLaneQIndex          : ImmCheckType<8>;  // (NEON) lane index, with the type treated as Quad
+def ImmCheckLaneIndexCompRotate : ImmCheckType<9>;  // 0..(128/(2*sizeinbits(elt)) - 1)
+def ImmCheckLaneIndexDot        : ImmCheckType<10>; // 0..(128/(4*sizeinbits(elt)) - 1)
+def ImmCheckComplexRot90_270    : ImmCheckType<11>; // [90, 270]
+def ImmCheckComplexRotAll90     : ImmCheckType<12>; // [0, 90, 180, 270]
+def ImmCheck0_13                : ImmCheckType<13>; // 0..13
+def ImmCheck0_1                 : ImmCheckType<14>; // 0..1
+def ImmCheck0_2                 : ImmCheckType<15>; // 0..2
+def ImmCheck0_3                 : ImmCheckType<16>; // 0..3
+def ImmCheck0_0                 : ImmCheckType<17>; // 0..0
+def ImmCheck0_15                : ImmCheckType<18>; // 0..15
+def ImmCheck0_255               : ImmCheckType<19>; // 0..255
+def ImmCheck2_4_Mul2            : ImmCheckType<20>; // 2, 4
+def ImmCheck1_1                 : ImmCheckType<21>; // 1..1
+def ImmCheck1_3                 : ImmCheckType<22>; // 1..3
+def ImmCheck1_7                 : ImmCheckType<23>; // 1..7
+def ImmCheck1_32                : ImmCheckType<24>; // 1..32
+def ImmCheck1_64                : ImmCheckType<25>; // 1..64
+def ImmCheck0_63                : ImmCheckType<26>; // 0..63
+
+class ImmCheck<int arg, ImmCheckType kind, int eltSizeArg = -1> {
+  int Arg = arg;
+  // The index of the argument whose type should be referred to when
+  // validating this immediate.
+  int EltSizeArg = eltSizeArg;
+  ImmCheckType Kind = kind;
+}
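[Editor's note] For the lane-index kinds, the bound is derived from the
element width of the referenced argument. A hedged C mirror of the NEON
computation implemented in SemaARM.cpp later in this patch (function names
here are illustrative only):

    /* ImmCheckLaneIndex: lanes of a 64- or 128-bit NEON vector. */
    unsigned lane_upper_bound(unsigned elt_bits, int is_quad) {
      return (64u << is_quad) / elt_bits - 1; /* e.g. i16, Q: 128/16 - 1 = 7 */
    }

    /* ImmCheckLaneQIndex: the type is forced to Quad (128-bit). */
    unsigned laneq_upper_bound(unsigned elt_bits) {
      return 128u / elt_bits - 1;
    }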
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 3098fa67e6a51..ee823f6ef6813 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -284,14 +284,18 @@ def OP_CVT_F32_BF16
 
 // Splat operation - performs a range-checked splat over a vector
 def SPLAT  : WInst<"splat_lane", ".(!q)I",
-                   "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl">;
+                   "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl", 
+                    [ImmCheck<1, ImmCheckLaneIndex, 0>]>;
 def SPLATQ : WInst<"splat_laneq", ".(!Q)I",
-                   "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl"> {
+                   "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl", 
+                   [ImmCheck<1, ImmCheckLaneQIndex, 1>]> {
   let isLaneQ = 1;
 }
 let TargetGuard = "bf16,neon" in {
-  def SPLAT_BF  : WInst<"splat_lane", ".(!q)I", "bQb">;
-  def SPLATQ_BF : WInst<"splat_laneq", ".(!Q)I", "bQb"> {
+  def SPLAT_BF  : WInst<"splat_lane", ".(!q)I", "bQb", 
+                      [ImmCheck<1, ImmCheckLaneIndex, 0>]>;
+  def SPLATQ_BF : WInst<"splat_laneq", ".(!Q)I", "bQb", 
+                      [ImmCheck<1, ImmCheckLaneQIndex, 1>]> {
     let isLaneQ = 1;
   }
 }
@@ -401,27 +405,45 @@ def VQRSHL : SInst<"vqrshl", "..S", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">;
 ////////////////////////////////////////////////////////////////////////////////
 // E.3.12 Shifts by constant
 let isShift = 1 in {
-def VSHR_N     : SInst<"vshr_n", "..I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">;
-def VSHL_N     : IInst<"vshl_n", "..I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">;
-def VRSHR_N    : SInst<"vrshr_n", "..I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">;
-def VSRA_N     : SInst<"vsra_n", "...I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">;
-def VRSRA_N    : SInst<"vrsra_n", "...I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">;
-def VQSHL_N    : SInst<"vqshl_n", "..I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">;
-def VQSHLU_N   : SInst<"vqshlu_n", "U.I", "csilQcQsQiQl">;
-def VSHRN_N    : IInst<"vshrn_n", "<QI", "silUsUiUl">;
-def VQSHRUN_N  : SInst<"vqshrun_n", "(<U)QI", "sil">;
-def VQRSHRUN_N : SInst<"vqrshrun_n", "(<U)QI", "sil">;
-def VQSHRN_N   : SInst<"vqshrn_n", "<QI", "silUsUiUl">;
-def VRSHRN_N   : IInst<"vrshrn_n", "<QI", "silUsUiUl">;
-def VQRSHRN_N  : SInst<"vqrshrn_n", "<QI", "silUsUiUl">;
-def VSHLL_N    : SInst<"vshll_n", "(>Q).I", "csiUcUsUi">;
+def VSHR_N     : SInst<"vshr_n", "..I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", 
+                      [ImmCheck<1, ImmCheckShiftRight>]>;
+def VSHL_N     : IInst<"vshl_n", "..I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", 
+                      [ImmCheck<1, ImmCheckShiftLeft>]>;
+def VRSHR_N    : SInst<"vrshr_n", "..I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", 
+                      [ImmCheck<1, ImmCheckShiftRight>]>;
+def VSRA_N     : SInst<"vsra_n", "...I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", 
+                      [ImmCheck<2, ImmCheckShiftRight>]>;
+def VRSRA_N    : SInst<"vrsra_n", "...I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", 
+                      [ImmCheck<2, ImmCheckShiftRight>]>;
+def VQSHL_N    : SInst<"vqshl_n", "..I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", 
+                      [ImmCheck<1, ImmCheckShiftLeft>]>;
+def VQSHLU_N   : SInst<"vqshlu_n", "U.I", "csilQcQsQiQl", 
+                      [ImmCheck<1, ImmCheckShiftLeft>]>;
+def VSHRN_N    : IInst<"vshrn_n", "<QI", "silUsUiUl", 
+                      [ImmCheck<1, ImmCheckShiftRight>]>;
+def VQSHRUN_N  : SInst<"vqshrun_n", "(<U)QI", "sil", 
+                      [ImmCheck<1, ImmCheckShiftRight>]>;
+def VQRSHRUN_N : SInst<"vqrshrun_n", "(<U)QI", "sil", 
+                      [ImmCheck<1, ImmCheckShiftRight>]>;
+def VQSHRN_N   : SInst<"vqshrn_n", "<QI", "silUsUiUl", 
+                      [ImmCheck<1, ImmCheckShiftRight>]>;
+def VRSHRN_N   : IInst<"vrshrn_n", "<QI", "silUsUiUl", 
+                      [ImmCheck<1, ImmCheckShiftRight>]>;
+def VQRSHRN_N  : SInst<"vqrshrn_n", "<QI", "silUsUiUl", 
+                      [ImmCheck<1, ImmCheckShiftRight>]>;
+def VSHLL_N    : SInst<"vshll_n", "(>Q).I", "csiUcUsUi", 
+                      [ImmCheck<1, ImmCheckShiftLeft>]>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // E.3.13 Shifts with insert
 def VSRI_N : WInst<"vsri_n", "...I",
-                   "csilUcUsUiUlPcPsQcQsQiQlQUcQUsQUiQUlQPcQPs">;
+                   "csilUcUsUiUlPcPsQcQsQiQlQUcQUsQUiQUlQPcQPs", 
+                    [ImmCheck<2, ImmCheckShiftRight>]>;
 def VSLI_N : WInst<"vsli_n", "...I",
-                   "csilUcUsUiUlPcPsQcQsQiQlQUcQUsQUiQUlQPcQPs">;
+                   "csilUcUsUiUlPcPsQcQsQiQlQUcQUsQUiQUlQPcQPs", 
+                   [ImmCheck<2, ImmCheckShiftLeft>]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -435,7 +457,8 @@ def VLD1_X3   : WInst<"vld1_x3", "3(c*!)",
 def VLD1_X4   : WInst<"vld1_x4", "4(c*!)",
                       "cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPs">;
 def VLD1_LANE : WInst<"vld1_lane", ".(c*!).I",
-                      "QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPs">;
+                      "QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPs", 
+                      [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
 def VLD1_DUP  : WInst<"vld1_dup", ".(c*!)",
                       "QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPs">;
 def VST1      : WInst<"vst1", "v*(.!)",
@@ -447,19 +470,23 @@ def VST1_X3   : WInst<"vst1_x3", "v*(3!)",
 def VST1_X4   : WInst<"vst1_x4", "v*(4!)",
                       "cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPs">;
 def VST1_LANE : WInst<"vst1_lane", "v*(.!)I",
-                      "QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPs">;
+                      "QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPs", 
+                      [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+
 let ArchGuard = "(__ARM_FP & 2)" in {
 def VLD1_F16      : WInst<"vld1", ".(c*!)", "hQh">;
 def VLD1_X2_F16   : WInst<"vld1_x2", "2(c*!)", "hQh">;
 def VLD1_X3_F16   : WInst<"vld1_x3", "3(c*!)", "hQh">;
 def VLD1_X4_F16   : WInst<"vld1_x4", "4(c*!)", "hQh">;
-def VLD1_LANE_F16 : WInst<"vld1_lane", ".(c*!).I", "hQh">;
+def VLD1_LANE_F16 : WInst<"vld1_lane", ".(c*!).I", "hQh", 
+                          [ImmCheck<2, ImmCheck0_3, 1>]>;
 def VLD1_DUP_F16  : WInst<"vld1_dup", ".(c*!)", "hQh">;
 def VST1_F16      : WInst<"vst1", "v*(.!)", "hQh">;
 def VST1_X2_F16   : WInst<"vst1_x2", "v*(2!)", "hQh">;
 def VST1_X3_F16   : WInst<"vst1_x3", "v*(3!)", "hQh">;
 def VST1_X4_F16   : WInst<"vst1_x4", "v*(4!)", "hQh">;
-def VST1_LANE_F16 : WInst<"vst1_lane", "v*(.!)I", "hQh">;
+def VST1_LANE_F16 : WInst<"vst1_lane", "v*(.!)I", "hQh", 
+                          [ImmCheck<2, ImmCheck0_3, 1>]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -473,15 +500,21 @@ def VLD3_DUP  : WInst<"vld3_dup", "3(c*!)",
                       "UcUsUiUlcsilfPcPsQcQfQiQlQsQPcQPsQUcQUiQUlQUs">;
 def VLD4_DUP  : WInst<"vld4_dup", "4(c*!)",
                       "UcUsUiUlcsilfPcPsQcQfQiQlQsQPcQPsQUcQUiQUlQUs">;
-def VLD2_LANE : WInst<"vld2_lane", "2(c*!)2I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">;
-def VLD3_LANE : WInst<"vld3_lane", "3(c*!)3I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">;
-def VLD4_LANE : WInst<"vld4_lane", "4(c*!)4I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">;
+def VLD2_LANE : WInst<"vld2_lane", "2(c*!)2I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs", 
+                      [ImmCheck<4, ImmCheckLaneIndex, 1>]>;
+def VLD3_LANE : WInst<"vld3_lane", "3(c*!)3I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs", 
+                      [ImmCheck<5, ImmCheckLaneIndex, 1>]>;
+def VLD4_LANE : WInst<"vld4_lane", "4(c*!)4I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs", 
+                      [ImmCheck<6, ImmCheckLaneIndex, 1>]>;
 def VST2 : WInst<"vst2", "v*(2!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">;
 def VST3 : WInst<"vst3", "v*(3!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">;
 def VST4 : WInst<"vst4", "v*(4!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">;
-def VST2_LANE : WInst<"vst2_lane", "v*(2!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">;
-def VST3_LANE : WInst<"vst3_lane", "v*(3!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">;
-def VST4_LANE : WInst<"vst4_lane", "v*(4!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">;
+def VST2_LANE : WInst<"vst2_lane", "v*(2!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs", 
+                      [ImmCheck<3, ImmCheckLaneIndex, 1>]>;
+def VST3_LANE : WInst<"vst3_lane", "v*(3!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs", 
+                      [ImmCheck<4, ImmCheckLaneIndex, 1>]>;
+def VST4_LANE : WInst<"vst4_lane", "v*(4!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs", 
+                      [ImmCheck<5, ImmCheckLaneIndex, 1>]>;
 let ArchGuard = "(__ARM_FP & 2)" in {
 def VLD2_F16      : WInst<"vld2", "2(c*!)", "hQh">;
 def VLD3_F16      : WInst<"vld3", "3(c*!)", "hQh">;
@@ -489,28 +522,36 @@ def VLD4_F16      : WInst<"vld4", "4(c*!)", "hQh">;
 def VLD2_DUP_F16  : WInst<"vld2_dup", "2(c*!)", "hQh">;
 def VLD3_DUP_F16  : WInst<"vld3_dup", "3(c*!)", "hQh">;
 def VLD4_DUP_F16  : WInst<"vld4_dup", "4(c*!)", "hQh">;
-def VLD2_LANE_F16 : WInst<"vld2_lane", "2(c*!)2I", "hQh">;
-def VLD3_LANE_F16 : WInst<"vld3_lane", "3(c*!)3I", "hQh">;
-def VLD4_LANE_F16 : WInst<"vld4_lane", "4(c*!)4I", "hQh">;
+def VLD2_LANE_F16 : WInst<"vld2_lane", "2(c*!)2I", "hQh", 
+                          [ImmCheck<4, ImmCheck0_3, 1>]>;
+def VLD3_LANE_F16 : WInst<"vld3_lane", "3(c*!)3I", "hQh", 
+                          [ImmCheck<5, ImmCheck0_3, 1>]>;
+def VLD4_LANE_F16 : WInst<"vld4_lane", "4(c*!)4I", "hQh", 
+                          [ImmCheck<6, ImmCheck0_3, 1>]>;
 def VST2_F16      : WInst<"vst2", "v*(2!)", "hQh">;
 def VST3_F16      : WInst<"vst3", "v*(3!)", "hQh">;
 def VST4_F16      : WInst<"vst4", "v*(4!)", "hQh">;
-def VST2_LANE_F16 : WInst<"vst2_lane", "v*(2!)I", "hQh">;
-def VST3_LANE_F16 : WInst<"vst3_lane", "v*(3!)I", "hQh">;
-def VST4_LANE_F16 : WInst<"vst4_lane", "v*(4!)I", "hQh">;
+def VST2_LANE_F16 : WInst<"vst2_lane", "v*(2!)I", "hQh", 
+                          [ImmCheck<3, ImmCheck0_3, 1>]>;
+def VST3_LANE_F16 : WInst<"vst3_lane", "v*(3!)I", "hQh", 
+                         [ImmCheck<4, ImmCheck0_3, 1>]>;
+def VST4_LANE_F16 : WInst<"vst4_lane", "v*(4!)I", "hQh", 
+                          [ImmCheck<5, ImmCheck0_3, 1>]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // E.3.16 Extract lanes from a vector
 let InstName = "vmov" in
 def VGET_LANE : IInst<"vget_lane", "1.I",
-                      "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl">;
+                      "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl", 
+                      [ImmCheck<1, ImmCheckLaneIndex, 0>]>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // E.3.17 Set lanes within a vector
 let InstName = "vmov" in
 def VSET_LANE : IInst<"vset_lane", ".1.I",
-                      "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl">;
+                      "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl", 
+                      [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // E.3.18 Initialize a vector from bit pattern
@@ -560,9 +601,12 @@ def VCVT_S32     : SInst<"vcvt_s32", "S.",  "fQf">;
 def VCVT_U32     : SInst<"vcvt_u32", "U.",  "fQf">;
 def VCVT_F32     : SInst<"vcvt_f32", "F(.!)",  "iUiQiQUi">;
 let isVCVT_N = 1 in {
-def VCVT_N_S32   : SInst<"vcvt_n_s32", "S.I", "fQf">;
-def VCVT_N_U32   : SInst<"vcvt_n_u32", "U.I", "fQf">;
-def VCVT_N_F32   : SInst<"vcvt_n_f32", "F(.!)I", "iUiQiQUi">;
+def VCVT_N_S32   : SInst<"vcvt_n_s32", "S.I", "fQf", 
+                        [ImmCheck<1, ImmCheck1_32>]>;
+def VCVT_N_U32   : SInst<"vcvt_n_u32", "U.I", "fQf", 
+                        [ImmCheck<1, ImmCheck1_32>]>;
+def VCVT_N_F32   : SInst<"vcvt_n_f32", "F(.!)I", "iUiQiQUi", 
+                        [ImmCheck<1, ImmCheck1_32>]>;
 }
 
 def VMOVN        : IInst<"vmovn", "<Q",  "silUsUiUl">;
@@ -610,8 +654,10 @@ def VQDMULH_LANE  : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>;
 def VQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "..qI", "siQsQi", OP_QRDMULH_LN>;
 }
 let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" in {
-def A64_VQDMULH_LANE  : SInst<"vqdmulh_lane", "..(!q)I", "siQsQi">;
-def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..(!q)I", "siQsQi">;
+def A64_VQDMULH_LANE  : SInst<"vqdmulh_lane", "..(!q)I", "siQsQi", 
+                              [ImmCheck<2, ImmCheckLaneIndex, 0>]>;
+def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..(!q)I", "siQsQi", 
+                              [ImmCheck<2, ImmCheckLaneIndex, 0>]>;
 }
 
 let TargetGuard = "v8.1a,neon" in {
@@ -629,7 +675,8 @@ def VQDMLSL_N     : SOpInst<"vqdmlsl_n", "(>Q)(>Q).1", "si", OP_QDMLSL_N>;
 ////////////////////////////////////////////////////////////////////////////////
 // E.3.26 Vector Extract
 def VEXT : WInst<"vext", "...I",
-                 "cUcPcsUsPsiUilUlfQcQUcQPcQsQUsQPsQiQUiQlQUlQf">;
+                 "cUcPcsUsPsiUilUlfQcQUcQPcQsQUsQPsQiQUiQlQUlQf", 
+                 [ImmCheck<2, ImmCheckLaneIndex, 0>]>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // E.3.27 Reverse vector elements
@@ -738,14 +785,22 @@ def ST1_X2 : WInst<"vst1_x2", "v*(2!)", "dQdPlQPl">;
 def ST1_X3 : WInst<"vst1_x3", "v*(3!)", "dQdPlQPl">;
 def ST1_X4 : WInst<"vst1_x4", "v*(4!)", "dQdPlQPl">;
 
-def LD1_LANE : WInst<"vld1_lane", ".(c*!).I", "dQdPlQPl">;
-def LD2_LANE : WInst<"vld2_lane", "2(c*!)2I", "lUlQcQUcQPcQlQUldQdPlQPl">;
-def LD3_LANE : WInst<"vld3_lane", "3(c*!)3I", "lUlQcQUcQPcQlQUldQdPlQPl">;
-def LD4_LANE : WInst<"vld4_lane", "4(c*!)4I", "lUlQcQUcQPcQlQUldQdPlQPl">;
-def ST1_LANE : WInst<"vst1_lane", "v*(.!)I", "dQdPlQPl">;
-def ST2_LANE : WInst<"vst2_lane", "v*(2!)I", "lUlQcQUcQPcQlQUldQdPlQPl">;
-def ST3_LANE : WInst<"vst3_lane", "v*(3!)I", "lUlQcQUcQPcQlQUldQdPlQPl">;
-def ST4_LANE : WInst<"vst4_lane", "v*(4!)I", "lUlQcQUcQPcQlQUldQdPlQPl">;
+def LD1_LANE : WInst<"vld1_lane", ".(c*!).I", "dQdPlQPl", 
+                    [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+def LD2_LANE : WInst<"vld2_lane", "2(c*!)2I", "lUlQcQUcQPcQlQUldQdPlQPl", 
+                    [ImmCheck<4, ImmCheckLaneIndex, 1>]>;
+def LD3_LANE : WInst<"vld3_lane", "3(c*!)3I", "lUlQcQUcQPcQlQUldQdPlQPl", 
+                    [ImmCheck<5, ImmCheckLaneIndex, 1>]>;
+def LD4_LANE : WInst<"vld4_lane", "4(c*!)4I", "lUlQcQUcQPcQlQUldQdPlQPl", 
+                    [ImmCheck<6, ImmCheckLaneIndex, 1>]>;
+def ST1_LANE : WInst<"vst1_lane", "v*(.!)I", "dQdPlQPl", 
+                    [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+def ST2_LANE : WInst<"vst2_lane", "v*(2!)I", "lUlQcQUcQPcQlQUldQdPlQPl", 
+                    [ImmCheck<3, ImmCheckLaneIndex, 1>]>;
+def ST3_LANE : WInst<"vst3_lane", "v*(3!)I", "lUlQcQUcQPcQlQUldQdPlQPl", 
+                    [ImmCheck<4, ImmCheckLaneIndex, 1>]>;
+def ST4_LANE : WInst<"vst4_lane", "v*(4!)I", "lUlQcQUcQPcQlQUldQdPlQPl", 
+                    [ImmCheck<5, ImmCheckLaneIndex, 1>]>;
 
 def LD1_DUP  : WInst<"vld1_dup", ".(c*!)", "dQdPlQPl">;
 def LD2_DUP  : WInst<"vld2_dup", "2(c*!)", "dQdPlQPl">;
@@ -901,8 +956,8 @@ def SHLL_HIGH_N    : SOpInst<"vshll_high_n", ">.I", "HcHsHiHUcHUsHUi",
                              OP_LONG_HI>;
 
 ////////////////////////////////////////////////////////////////////////////////
-def SRI_N : WInst<"vsri_n", "...I", "PlQPl">;
-def SLI_N : WInst<"vsli_n", "...I", "PlQPl">;
+def SRI_N : WInst<"vsri_n", "...I", "PlQPl", [ImmCheck<2, ImmCheckShiftRight, 1>]>;
+def SLI_N : WInst<"vsli_n", "...I", "PlQPl", [ImmCheck<2, ImmCheckShiftLeft, 1>]>;
 
 // Right shift narrow high
 def SHRN_HIGH_N    : IOpInst<"vshrn_high_n", "<(<q).I",
@@ -924,9 +979,12 @@ def QRSHRN_HIGH_N  : SOpInst<"vqrshrn_high_n", "<(<q).I",
 def VMOVL_HIGH   : SOpInst<"vmovl_high", ">.", "HcHsHiHUcHUsHUi", OP_MOVL_HI>;
 
 let isVCVT_N = 1 in {
-def CVTF_N_F64   : SInst<"vcvt_n_f64", "F(.!)I", "lUlQlQUl">;
-def FCVTZS_N_S64 : SInst<"vcvt_n_s64", "S.I", "dQd">;
-def FCVTZS_N_U64 : SInst<"vcvt_n_u64", "U.I", "dQd">;
+def CVTF_N_F64   : SInst<"vcvt_n_f64", "F(.!)I", "lUlQlQUl", 
+                        [ImmCheck<1, ImmCheck1_64>]>;
+def FCVTZS_N_S64 : SInst<"vcvt_n_s64", "S.I", "dQd", 
+                        [ImmCheck<1, ImmCheck1_64>]>;
+def FCVTZS_N_U64 : SInst<"vcvt_n_u64", "U.I", "dQd", 
+                        [ImmCheck<1, ImmCheck1_64>]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -965,8 +1023,10 @@ let TargetGuard = "aes,neon" in {
 
 ////////////////////////////////////////////////////////////////////////////////
 // Extract or insert element from vector
-def GET_LANE : IInst<"vget_lane", "1.I", "dQdPlQPl">;
-def SET_LANE : IInst<"vset_lane", ".1.I", "dQdPlQPl">;
+def GET_LANE : IInst<"vget_lane", "1.I", "dQdPlQPl", 
+                      [ImmCheck<1, ImmCheckLaneIndex, 0>]>;
+def SET_LANE : IInst<"vset_lane", ".1.I", "dQdPlQPl", 
+                      [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
 def COPY_LANE : IOpInst<"vcopy_lane", "..I.I",
                         "csilUcUsUiUlPcPsPlfd", OP_COPY_LN>;
 def COPYQ_LANE : IOpInst<"vcopy_lane", "..IqI",
@@ -1011,8 +1071,10 @@ def VMLS_LANEQ   : IOpInst<"vmls_laneq", "...QI",
   let isLaneQ = 1;
 }
 
-def VFMA_LANE    : IInst<"vfma_lane", "...qI", "fdQfQd">;
-def VFMA_LANEQ   : IInst<"vfma_laneq", "...QI", "fdQfQd"> {
+def VFMA_LANE    : IInst<"vfma_lane", "...qI", "fdQfQd", 
+                        [ImmCheck<3, ImmCheckLaneIndex, 0>]>;
+def VFMA_LANEQ   : IInst<"vfma_laneq", "...QI", "fdQfQd", 
+                        [ImmCheck<3, ImmCheckLaneQIndex, 2>]> {
   let isLaneQ = 1;
 }
 def VFMS_LANE    : IOpInst<"vfms_lane", "...qI", "fdQfQd", OP_FMS_LN>;
@@ -1088,8 +1150,10 @@ def VQDMULL_HIGH_LANEQ  : SOpInst<"vqdmull_high_laneq", "(>Q)QQI", "si",
 }
 
 let isLaneQ = 1 in {
-def VQDMULH_LANEQ  : SInst<"vqdmulh_laneq", "..QI", "siQsQi">;
-def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi">;
+def VQDMULH_LANEQ  : SInst<"vqdmulh_laneq", "..QI", "siQsQi", 
+                          [ImmCheck<2, ImmCheckLaneQIndex, 1>]>;
+def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi", 
+                          [ImmCheck<2, ImmCheckLaneQIndex, 1>]>;
 }
 let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a,neon" in {
 def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN> {
@@ -1118,7 +1182,8 @@ def FMINNMV : SInst<"vminnmv", "1.", "fQfQd">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Newly added Vector Extract for f64
-def VEXT_A64 : WInst<"vext", "...I", "dQdPlQPl">;
+def VEXT_A64 : WInst<"vext", "...I", "dQdPlQPl", 
+                    [ImmCheck<2, ImmCheckLaneIndex, 0>]>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Crypto
@@ -1149,7 +1214,7 @@ def EOR3 : SInst<"veor3", "....", "QUcQUsQUiQUlQcQsQiQl">;
 def RAX1 : SInst<"vrax1", "...", "QUl">;
 
 let isVXAR = 1 in {
-def XAR :  SInst<"vxar", "...I", "QUl">;
+def XAR :  SInst<"vxar", "...I", "QUl", [ImmCheck<2, ImmCheck0_63>]>;
 }
 }
 
@@ -1162,10 +1227,10 @@ def SHA512H2 : SInst<"vsha512h2", "....", "QUl">;
 
 let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sm4,neon" in {
 def SM3SS1 : SInst<"vsm3ss1", "....", "QUi">;
-def SM3TT1A : SInst<"vsm3tt1a", "....I", "QUi">;
-def SM3TT1B : SInst<"vsm3tt1b", "....I", "QUi">;
-def SM3TT2A : SInst<"vsm3tt2a", "....I", "QUi">;
-def SM3TT2B : SInst<"vsm3tt2b", "....I", "QUi">;
+def SM3TT1A : SInst<"vsm3tt1a", "....I", "QUi", [ImmCheck<3, ImmCheck0_3>]>;
+def SM3TT1B : SInst<"vsm3tt1b", "....I", "QUi", [ImmCheck<3, ImmCheck0_3>]>;
+def SM3TT2A : SInst<"vsm3tt2a", "....I", "QUi", [ImmCheck<3, ImmCheck0_3>]>;
+def SM3TT2B : SInst<"vsm3tt2b", "....I", "QUi", [ImmCheck<3, ImmCheck0_3>]>;
 def SM3PARTW1 : SInst<"vsm3partw1", "....", "QUi">;
 def SM3PARTW2 : SInst<"vsm3partw2", "....", "QUi">;
 }
@@ -1327,49 +1392,68 @@ def SCALAR_RSHL: SInst<"vrshl", "11(S1)", "SlSUl">;
 // Scalar Shift (Immediate)
 let isScalarShift = 1 in {
 // Signed/Unsigned Shift Right (Immediate)
-def SCALAR_SSHR_N: SInst<"vshr_n", "11I", "SlSUl">;
+def SCALAR_SSHR_N: SInst<"vshr_n", "11I", "SlSUl", 
+                        [ImmCheck<1, ImmCheckShiftRight, 0>]>;
 // Signed/Unsigned Rounding Shift Right (Immediate)
-def SCALAR_SRSHR_N: SInst<"vrshr_n", "11I", "SlSUl">;
+def SCALAR_SRSHR_N: SInst<"vrshr_n", "11I", "SlSUl", 
+                          [ImmCheck<1, ImmCheckShiftRight, 0>]>;
 
 // Signed/Unsigned Shift Right and Accumulate (Immediate)
-def SCALAR_SSRA_N: SInst<"vsra_n", "111I", "SlSUl">;
+def SCALAR_SSRA_N: SInst<"vsra_n", "111I", "SlSUl", 
+                        [ImmCheck<2, ImmCheckShiftRight, 0>]>;
 // Signed/Unsigned Rounding Shift Right and Accumulate (Immediate)
-def SCALAR_SRSRA_N: SInst<"vrsra_n", "111I", "SlSUl">;
+def SCALAR_SRSRA_N: SInst<"vrsra_n", "111I", "SlSUl", 
+                        [ImmCheck<2, ImmCheckShiftRight, 0>]>;
 
 // Shift Left (Immediate)
-def SCALAR_SHL_N: SInst<"vshl_n", "11I", "SlSUl">;
+def SCALAR_SHL_N: SInst<"vshl_n", "11I", "SlSUl", 
+                      [ImmCheck<1, ImmCheckShiftLeft, 0>]>;
 // Signed/Unsigned Saturating Shift Left (Immediate)
-def SCALAR_SQSHL_N: SInst<"vqshl_n", "11I", "ScSsSiSlSUcSUsSUiSUl">;
+def SCALAR_SQSHL_N: SInst<"vqshl_n", "11I", "ScSsSiSlSUcSUsSUiSUl", 
+                      [ImmCheck<1, ImmCheckShiftLeft, 0>]>;
 // Signed Saturating Shift Left Unsigned (Immediate)
-def SCALAR_SQSHLU_N: SInst<"vqshlu_n", "11I", "ScSsSiSl">;
+def SCALAR_SQSHLU_N: SInst<"vqshlu_n", "11I", "ScSsSiSl", 
+                      [ImmCheck<1, ImmCheckShiftLeft, 0>]>;
 
 // Shift Right And Insert (Immediate)
-def SCALAR_SRI_N: SInst<"vsri_n", "111I", "SlSUl">;
+def SCALAR_SRI_N: SInst<"vsri_n", "111I", "SlSUl", 
+                        [ImmCheck<2, ImmCheckShiftRight, 0>]>;
 // Shift Left And Insert (Immediate)
-def SCALAR_SLI_N: SInst<"vsli_n", "111I", "SlSUl">;
+def SCALAR_SLI_N: SInst<"vsli_n", "111I", "SlSUl", 
+                        [ImmCheck<2, ImmCheckShiftLeft, 0>]>;
 
 let isScalarNarrowShift = 1 in {
   // Signed/Unsigned Saturating Shift Right Narrow (Immediate)
-  def SCALAR_SQSHRN_N: SInst<"vqshrn_n", "(1<)1I", "SsSiSlSUsSUiSUl">;
+  def SCALAR_SQSHRN_N: SInst<"vqshrn_n", "(1<)1I", "SsSiSlSUsSUiSUl", 
+                            [ImmCheck<1, ImmCheckShiftRight, 0>]>;
   // Signed/Unsigned Saturating Rounded Shift Right Narrow (Immediate)
-  def SCALAR_SQRSHRN_N: SInst<"vqrshrn_n", "(1<)1I", "SsSiSlSUsSUiSUl">;
+  def SCALAR_SQRSHRN_N: SInst<"vqrshrn_n", "(1<)1I", "SsSiSlSUsSUiSUl", 
+                            [ImmCheck<1, ImmCheckShiftRight, 0>]>;
   // Signed Saturating Shift Right Unsigned Narrow (Immediate)
-  def SCALAR_SQSHRUN_N: SInst<"vqshrun_n", "(1<U)1I", "SsSiSl">;
+  def SCALAR_SQSHRUN_N: SInst<"vqshrun_n", "(1<U)1I", "SsSiSl", 
+                            [ImmCheck<1, ImmCheckShiftRight, 0>]>;
   // Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate)
-  def SCALAR_SQRSHRUN_N: SInst<"vqrshrun_n", "(1<U)1I", "SsSiSl">;
+  def SCALAR_SQRSHRUN_N: SInst<"vqrshrun_n", "(1<U)1I", "SsSiSl", 
+                            [ImmCheck<1, ImmCheckShiftRight, 0>]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // Scalar Signed/Unsigned Fixed-point Convert To Floating-Point (Immediate)
-def SCALAR_SCVTF_N_F32: SInst<"vcvt_n_f32", "(1F)(1!)I", "SiSUi">;
-def SCALAR_SCVTF_N_F64: SInst<"vcvt_n_f64", "(1F)(1!)I", "SlSUl">;
+def SCALAR_SCVTF_N_F32: SInst<"vcvt_n_f32", "(1F)(1!)I", "SiSUi", 
+                              [ImmCheck<1, ImmCheck1_32>]>;
+def SCALAR_SCVTF_N_F64: SInst<"vcvt_n_f64", "(1F)(1!)I", "SlSUl", 
+                              [ImmCheck<1, ImmCheck1_64>]>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Scalar Floating-point Convert To Signed/Unsigned Fixed-point (Immediate)
-def SCALAR_FCVTZS_N_S32 : SInst<"vcvt_n_s32", "(1S)1I", "Sf">;
-def SCALAR_FCVTZU_N_U32 : SInst<"vcvt_n_u32", "(1U)1I", "Sf">;
-def SCALAR_FCVTZS_N_S64 : SInst<"vcvt_n_s64", "(1S)1I", "Sd">;
-def SCALAR_FCVTZU_N_U64 : SInst<"vcvt_n_u64", "(1U)1I", "Sd">;
+def SCALAR_FCVTZS_N_S32 : SInst<"vcvt_n_s32", "(1S)1I", "Sf", 
+                                [ImmCheck<1, ImmCheck1_32>]>;
+def SCALAR_FCVTZU_N_U32 : SInst<"vcvt_n_u32", "(1U)1I", "Sf", 
+                                [ImmCheck<1, ImmCheck1_32>]>;
+def SCALAR_FCVTZS_N_S64 : SInst<"vcvt_n_s64", "(1S)1I", "Sd", 
+                                [ImmCheck<1, ImmCheck1_64>]>;
+def SCALAR_FCVTZU_N_U64 : SInst<"vcvt_n_u64", "(1U)1I", "Sd", 
+                                [ImmCheck<1, ImmCheck1_64>]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -1575,10 +1659,12 @@ def SCALAR_FMULX_LANEQ : IOpInst<"vmulx_laneq", "11QI", "SfSd", OP_SCALAR_MULX_L
 def SCALAR_VMUL_N : IInst<"vmul_n", "..1", "d">;
 
 // VMUL_LANE_A64 d type implemented using scalar mul lane
-def SCALAR_VMUL_LANE : IInst<"vmul_lane", "..qI", "d">;
+def SCALAR_VMUL_LANE : IInst<"vmul_lane", "..qI", "d", 
+                            [ImmCheck<2, ImmCheckLaneIndex, 0>]>;
 
 // VMUL_LANEQ d type implemented using scalar mul lane
-def SCALAR_VMUL_LANEQ   : IInst<"vmul_laneq", "..QI", "d"> {
+def SCALAR_VMUL_LANEQ : IInst<"vmul_laneq", "..QI", "d", 
+                              [ImmCheck<2, ImmCheckLaneQIndex, 1>]> {
   let isLaneQ = 1;
 }
 
@@ -1591,8 +1677,10 @@ def SCALAR_VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "d", OP_SCALAR_VMULX_LNQ
 }
 
 // Scalar Floating Point fused multiply-add (scalar, by element)
-def SCALAR_FMLA_LANE : IInst<"vfma_lane", "111.I", "SfSd">;
-def SCALAR_FMLA_LANEQ : IInst<"vfma_laneq", "111QI", "SfSd"> {
+def SCALAR_FMLA_LANE : IInst<"vfma_lane", "111.I", "SfSd", 
+                            [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SCALAR_FMLA_LANEQ : IInst<"vfma_laneq", "111QI", "SfSd", 
+                            [ImmCheck<3, ImmCheckLaneQIndex, 2>]> {
   let isLaneQ = 1;
 }
 
@@ -1609,14 +1697,18 @@ def SCALAR_SQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(1>)1QI", "SsSi", OP_SCALAR
 }
 
 // Signed Saturating Doubling Multiply-Add Long (scalar by element)
-def SCALAR_SQDMLAL_LANE : SInst<"vqdmlal_lane", "(1>)(1>)1.I", "SsSi">;
-def SCALAR_SQDMLAL_LANEQ : SInst<"vqdmlal_laneq", "(1>)(1>)1QI", "SsSi"> {
+def SCALAR_SQDMLAL_LANE : SInst<"vqdmlal_lane", "(1>)(1>)1.I", "SsSi", 
+                                [ImmCheck<3, ImmCheckLaneIndex, 1>]>;
+def SCALAR_SQDMLAL_LANEQ : SInst<"vqdmlal_laneq", "(1>)(1>)1QI", "SsSi", 
+                                [ImmCheck<3, ImmCheckLaneQIndex, 2>]> {
   let isLaneQ = 1;
 }
 
 // Signed Saturating Doubling Multiply-Subtract Long (scalar by element)
-def SCALAR_SQDMLS_LANE : SInst<"vqdmlsl_lane", "(1>)(1>)1.I", "SsSi">;
-def SCALAR_SQDMLS_LANEQ : SInst<"vqdmlsl_laneq", "(1>)(1>)1QI", "SsSi"> {
+def SCALAR_SQDMLS_LANE : SInst<"vqdmlsl_lane", "(1>)(1>)1.I", "SsSi", 
+                              [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SCALAR_SQDMLS_LANEQ : SInst<"vqdmlsl_laneq", "(1>)(1>)1QI", "SsSi", 
+                              [ImmCheck<3, ImmCheckLaneQIndex, 2>]> {
   let isLaneQ = 1;
 }
 
@@ -1646,8 +1738,10 @@ def SCALAR_SQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "111QI", "SsSi", OP_SCALAR
 }
 } // TargetGuard = "v8.1a"
 
-def SCALAR_VDUP_LANE : IInst<"vdup_lane", "1.I", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">;
-def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs"> {
+def SCALAR_VDUP_LANE : IInst<"vdup_lane", "1.I", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs", 
+                            [ImmCheck<1, ImmCheckLaneIndex, 0>]>;
+def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs", 
+                            [ImmCheck<1, ImmCheckLaneQIndex, 0>]> {
   let isLaneQ = 1;
 }
 
@@ -1720,9 +1814,12 @@ let TargetGuard = "fullfp16,neon" in {
 
   // Vector conversion
   let isVCVT_N = 1 in {
-    def VCVT_N_F16 : SInst<"vcvt_n_f16", "F(.!)I", "sUsQsQUs">;
-    def VCVT_N_S16 : SInst<"vcvt_n_s16", "S.I", "hQh">;
-    def VCVT_N_U16 : SInst<"vcvt_n_u16", "U.I", "hQh">;
+    def VCVT_N_F16 : SInst<"vcvt_n_f16", "F(.!)I", "sUsQsQUs", 
+                          [ImmCheck<1, ImmCheck1_16>]>;
+    def VCVT_N_S16 : SInst<"vcvt_n_s16", "S.I", "hQh", 
+                          [ImmCheck<1, ImmCheck1_16>]>;
+    def VCVT_N_U16 : SInst<"vcvt_n_u16", "U.I", "hQh", 
+                          [ImmCheck<1, ImmCheck1_16>]>;
   }
 
   // Max/Min
@@ -1770,7 +1867,7 @@ def VZIPH    : WInst<"vzip", "2..", "hQh">;
 def VUZPH    : WInst<"vuzp", "2..", "hQh">;
 def VTRNH    : WInst<"vtrn", "2..", "hQh">;
 // Vector Extract
-def VEXTH      : WInst<"vext", "...I", "hQh">;
+def VEXTH      : WInst<"vext", "...I", "hQh", [ImmCheck<2, ImmCheckLaneIndex, 0>]>;
 // Reverse vector elements
 def VREV64H    : WOpInst<"vrev64", "..", "hQh", OP_REV64>;
 
@@ -1801,16 +1898,20 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
   // ARMv8.2-A FP16 lane vector intrinsics.
 
   // FMA lane
-  def VFMA_LANEH   : IInst<"vfma_lane", "...qI", "hQh">;
-  def VFMA_LANEQH  : IInst<"vfma_laneq", "...QI", "hQh"> {
+  def VFMA_LANEH   : IInst<"vfma_lane", "...qI", "hQh", 
+                          [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+  def VFMA_LANEQH  : IInst<"vfma_laneq", "...QI", "hQh", 
+                          [ImmCheck<3, ImmCheckLaneQIndex, 2>]> {
     let isLaneQ = 1;
   }
 
   // FMA lane with scalar argument
   def FMLA_NH      : SOpInst<"vfma_n", "...1", "hQh", OP_FMLA_N>;
   // Scalar floating point fused multiply-add (scalar, by element)
-  def SCALAR_FMLA_LANEH  : IInst<"vfma_lane", "111.I", "Sh">;
-  def SCALAR_FMLA_LANEQH : IInst<"vfma_laneq", "111QI", "Sh"> {
+  def SCALAR_FMLA_LANEH  : IInst<"vfma_lane", "111.I", "Sh", 
+                                [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+  def SCALAR_FMLA_LANEQH : IInst<"vfma_laneq", "111QI", "Sh", 
+                                [ImmCheck<3, ImmCheckLaneQIndex, 2>]> {
     let isLaneQ = 1;
   }
 
@@ -1844,8 +1945,10 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
   }
   def VMULX_NH      : IOpInst<"vmulx_n", "..1", "hQh", OP_MULX_N>;
   // Scalar floating point  mulx (scalar, by element)
-  def SCALAR_FMULX_LANEH : IInst<"vmulx_lane", "11.I", "Sh">;
-  def SCALAR_FMULX_LANEQH : IInst<"vmulx_laneq", "11QI", "Sh"> {
+  def SCALAR_FMULX_LANEH : IInst<"vmulx_lane", "11.I", "Sh", 
+                                [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+  def SCALAR_FMULX_LANEQH : IInst<"vmulx_laneq", "11QI", "Sh", 
+                                [ImmCheck<2, ImmCheckLaneQIndex, 1>]> {
     let isLaneQ = 1;
   }
 
@@ -1865,8 +1968,10 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" in {
   def VZIP2H     : SOpInst<"vzip2", "...", "hQh", OP_ZIP2>;
   def VUZP2H     : SOpInst<"vuzp2", "...", "hQh", OP_UZP2>;
 
-  def SCALAR_VDUP_LANEH  : IInst<"vdup_lane", "1.I", "Sh">;
-  def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "1QI", "Sh"> {
+  def SCALAR_VDUP_LANEH  : IInst<"vdup_lane", "1.I", "Sh", 
+                                [ImmCheck<1, ImmCheckLaneIndex, 0>]>;
+  def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "1QI", "Sh", 
+                                [ImmCheck<1, ImmCheckLaneQIndex, 0>]> {
     let isLaneQ = 1;
   }
 }
@@ -1959,9 +2064,12 @@ multiclass VCMLA_ROTS<string type, string lanety, string laneqty> {
 
     let isLaneQ = 1 in  {
       // vcmla{ROT}_laneq
+      // ACLE specifies that the fp16 vcmla_#ROT_laneq variant has an
+      // immediate range of 0 <= lane <= 1; fp16 is the only type for which
+      // the _lane and _laneq ranges differ.
+      // https://developer.arm.com/documentation/ihi0073/latest/
+      defvar getlanety = !if(!eq(type, "h"), lanety, laneqty);
       def : SOpInst<"vcmla" # ROT # "_laneq", "...QI", type,  Op<(call "vcmla" # ROT, $p0, $p1,
-              (bitcast $p0, (dup_typed lanety, (call "vget_lane", (bitcast laneqty, $p2), $p3))))>>;
-
+                (bitcast $p0, (dup_typed lanety, (call "vget_lane", (bitcast getlanety, $p2), $p3))))>>;
       // vcmlaq{ROT}_laneq
       def : SOpInst<"vcmla" # ROT # "_laneq", "...QI", "Q" # type, Op<(call "vcmla" # ROT, $p0, $p1,
              (bitcast $p0, (dup_typed laneqty , (call "vget_lane", (bitcast laneqty, $p2), $p3))))>>;
@@ -2011,10 +2119,14 @@ let TargetGuard = "bf16,neon" in {
   def VGET_HIGH_BF : NoTestOpInst<"vget_high", ".Q", "b", OP_HI>;
   def VGET_LOW_BF  : NoTestOpInst<"vget_low", ".Q", "b", OP_LO>;
 
-  def VGET_LANE_BF : IInst<"vget_lane", "1.I", "bQb">;
-  def VSET_LANE_BF : IInst<"vset_lane", ".1.I", "bQb">;
-  def SCALAR_VDUP_LANE_BF : IInst<"vdup_lane", "1.I", "Sb">;
-  def SCALAR_VDUP_LANEQ_BF : IInst<"vdup_laneq", "1QI", "Sb"> {
+  def VGET_LANE_BF : IInst<"vget_lane", "1.I", "bQb", 
+                          [ImmCheck<1, ImmCheckLaneIndex, 0>]>;
+  def VSET_LANE_BF : IInst<"vset_lane", ".1.I", "bQb", 
+                          [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+  def SCALAR_VDUP_LANE_BF : IInst<"vdup_lane", "1.I", "Sb", 
+                          [ImmCheck<1, ImmCheckLaneIndex, 0>]>;
+  def SCALAR_VDUP_LANEQ_BF : IInst<"vdup_laneq", "1QI", "Sb", 
+                          [ImmCheck<1, ImmCheckLaneQIndex, 0>]> {
     let isLaneQ = 1;
   }
 
@@ -2036,14 +2148,22 @@ let TargetGuard = "bf16,neon" in {
   def VST1_X3_BF : WInst<"vst1_x3", "v*(3!)", "bQb">;
   def VST1_X4_BF : WInst<"vst1_x4", "v*(4!)", "bQb">;
 
-  def VLD1_LANE_BF : WInst<"vld1_lane", ".(c*!).I", "bQb">;
-  def VLD2_LANE_BF : WInst<"vld2_lane", "2(c*!)2I", "bQb">;
-  def VLD3_LANE_BF : WInst<"vld3_lane", "3(c*!)3I", "bQb">;
-  def VLD4_LANE_BF : WInst<"vld4_lane", "4(c*!)4I", "bQb">;
-  def VST1_LANE_BF : WInst<"vst1_lane", "v*(.!)I", "bQb">;
-  def VST2_LANE_BF : WInst<"vst2_lane", "v*(2!)I", "bQb">;
-  def VST3_LANE_BF : WInst<"vst3_lane", "v*(3!)I", "bQb">;
-  def VST4_LANE_BF : WInst<"vst4_lane", "v*(4!)I", "bQb">;
+  def VLD1_LANE_BF : WInst<"vld1_lane", ".(c*!).I", "bQb", 
+                          [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+  def VLD2_LANE_BF : WInst<"vld2_lane", "2(c*!)2I", "bQb", 
+                          [ImmCheck<4, ImmCheckLaneIndex, 1>]>;
+  def VLD3_LANE_BF : WInst<"vld3_lane", "3(c*!)3I", "bQb", 
+                          [ImmCheck<5, ImmCheckLaneIndex, 1>]>;
+  def VLD4_LANE_BF : WInst<"vld4_lane", "4(c*!)4I", "bQb", 
+                          [ImmCheck<6, ImmCheckLaneIndex, 1>]>;
+  def VST1_LANE_BF : WInst<"vst1_lane", "v*(.!)I", "bQb", 
+                          [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+  def VST2_LANE_BF : WInst<"vst2_lane", "v*(2!)I", "bQb", 
+                          [ImmCheck<3, ImmCheckLaneIndex, 1>]>;
+  def VST3_LANE_BF : WInst<"vst3_lane", "v*(3!)I", "bQb",
+                          [ImmCheck<4, ImmCheckLaneIndex, 1>]>;
+  def VST4_LANE_BF : WInst<"vst4_lane", "v*(4!)I", "bQb", 
+                          [ImmCheck<5, ImmCheckLaneIndex, 1>]>;
 
   def VLD1_DUP_BF : WInst<"vld1_dup", ".(c*!)", "bQb">;
   def VLD2_DUP_BF : WInst<"vld2_dup", "2(c*!)", "bQb">;
@@ -2093,6 +2213,8 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "b
 
 // v8.9a/v9.4a LRCPC3 intrinsics
 let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "rcpc3,neon" in {
-  def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">;
-  def VSTL1_LANE  : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">;
+  def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl",
+                        [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+  def VSTL1_LANE  : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl", 
+                        [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
 }
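[Editor's note] The VCMLA_ROTS change above encodes the one place where _lane
and _laneq ranges diverge: for fp16, ACLE restricts vcmla*_laneq to lanes
0..1. A short illustration (assumes -march=armv8.3-a+fp16 or similar):

    #include <arm_neon.h>

    float16x4_t rot0(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
      return vcmla_laneq_f16(acc, lhs, rhs, 1); /* ok: lane must be 0 or 1 */
    }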
diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td
index 3b8015daee6d9..2b5acd41e7bbd 100644
--- a/clang/include/clang/Basic/arm_neon_incl.td
+++ b/clang/include/clang/Basic/arm_neon_incl.td
@@ -21,6 +21,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+include "arm_immcheck_incl.td"
+
 // The base Operation class. All operations must subclass this.
 class Operation<list<dag> ops=[]> {
   list<dag> Ops = ops;
@@ -260,7 +262,7 @@ def OP_UNAVAILABLE : Operation {
 
 
 // Every intrinsic subclasses Inst.
-class Inst <string n, string p, string t, Operation o> {
+class Inst <string n, string p, string t, Operation o, list<ImmCheck> ch = []>{
   string Name = n;
   string Prototype = p;
   string Types = t;
@@ -278,6 +280,7 @@ class Inst <string n, string p, string t, Operation o> {
   // a Q register. Only used for intrinsics which end up calling polymorphic
   // builtins.
   bit isLaneQ = 0;
+  list<ImmCheck> ImmChecks = ch;
 
   // Certain intrinsics have different names than their representative
   // instructions. This field allows us to handle this correctly when we
@@ -300,9 +303,9 @@ class Inst <string n, string p, string t, Operation o> {
 // SInst: Instruction with signed/unsigned suffix (e.g., "s8", "u8", "p8")
 // IInst: Instruction with generic integer suffix (e.g., "i8")
 // WInst: Instruction with only bit size suffix (e.g., "8")
-class SInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
-class IInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
-class WInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
+class SInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
+class IInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
+class WInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
 
 // The following instruction classes are implemented via operators
 // instead of builtins. As such these declarations are only used for
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td
index 6ec357825a132..fdf4ba55fe938 100644
--- a/clang/include/clang/Basic/arm_sve_sme_incl.td
+++ b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -13,6 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+include "arm_immcheck_incl.td"
+
 //===----------------------------------------------------------------------===//
 // Instruction definitions
 //===----------------------------------------------------------------------===//
@@ -233,40 +235,6 @@ def IsInZT0                         : FlagType<0x400000000000>;
 def IsOutZT0                        : FlagType<0x800000000000>;
 def IsInOutZT0                      : FlagType<0x1000000000000>;
 
-// These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h
-class ImmCheckType<int val> {
-  int Value = val;
-}
-def ImmCheck0_31                : ImmCheckType<0>;  // 0..31 (used for e.g. predicate patterns)
-def ImmCheck1_16                : ImmCheckType<1>;  // 1..16
-def ImmCheckExtract             : ImmCheckType<2>;  // 0..(2048/sizeinbits(elt) - 1)
-def ImmCheckShiftRight          : ImmCheckType<3>;  // 1..sizeinbits(elt)
-def ImmCheckShiftRightNarrow    : ImmCheckType<4>;  // 1..sizeinbits(elt)/2
-def ImmCheckShiftLeft           : ImmCheckType<5>;  // 0..(sizeinbits(elt) - 1)
-def ImmCheck0_7                 : ImmCheckType<6>;  // 0..7
-def ImmCheckLaneIndex           : ImmCheckType<7>;  // 0..(128/(1*sizeinbits(elt)) - 1)
-def ImmCheckLaneIndexCompRotate : ImmCheckType<8>;  // 0..(128/(2*sizeinbits(elt)) - 1)
-def ImmCheckLaneIndexDot        : ImmCheckType<9>;  // 0..(128/(4*sizeinbits(elt)) - 1)
-def ImmCheckComplexRot90_270    : ImmCheckType<10>; // [90,270]
-def ImmCheckComplexRotAll90     : ImmCheckType<11>; // [0, 90, 180,270]
-def ImmCheck0_13                : ImmCheckType<12>; // 0..13
-def ImmCheck0_1                 : ImmCheckType<13>; // 0..1
-def ImmCheck0_2                 : ImmCheckType<14>; // 0..2
-def ImmCheck0_3                 : ImmCheckType<15>; // 0..3
-def ImmCheck0_0                 : ImmCheckType<16>; // 0..0
-def ImmCheck0_15                : ImmCheckType<17>; // 0..15
-def ImmCheck0_255               : ImmCheckType<18>; // 0..255
-def ImmCheck2_4_Mul2            : ImmCheckType<19>; // 2, 4
-def ImmCheck1_1                 : ImmCheckType<20>; // 1..1
-def ImmCheck1_3                 : ImmCheckType<21>; // 1..3
-def ImmCheck1_7                 : ImmCheckType<22>; // 1..7
-
-class ImmCheck<int arg, ImmCheckType kind, int eltSizeArg = -1> {
-  int Arg = arg;
-  int EltSizeArg = eltSizeArg;
-  ImmCheckType Kind = kind;
-}
-
 defvar InvalidMode = "";
 
 class Inst<string n, string p, string t, MergeType mt, string i,
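[Editor's note] With the ImmCheckType defs hoisted into arm_immcheck_incl.td,
SVE/SME and NEON share one numbering, which must stay in sync with the
ArmImmCheckType enum generated into TargetBuiltins.h. Illustrative excerpt of
the expected numbering (the real values come from arm_sve_typeflags.inc):

    enum ArmImmCheckTypeSketch {
      kImmCheck0_31 = 0, /* first shared check kind */
      kImmCheck1_16 = 1,
      /* ... */
      kImmCheck0_63 = 26 /* last kind added by this patch */
    };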
diff --git a/clang/include/clang/Sema/SemaARM.h b/clang/include/clang/Sema/SemaARM.h
index fedc7df7908f1..1ced84300c179 100644
--- a/clang/include/clang/Sema/SemaARM.h
+++ b/clang/include/clang/Sema/SemaARM.h
@@ -41,6 +41,9 @@ class SemaARM : public SemaBase {
                                     unsigned MaxWidth);
   bool CheckNeonBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
                                     CallExpr *TheCall);
+  bool ParseNeonImmChecks(CallExpr *TheCall,
+                          SmallVector<std::tuple<int, int, int>, 2> &ImmChecks,
+                          int OverloadType = -1);
   bool CheckMVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
   bool CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
   bool
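[Editor's note] ParseNeonImmChecks consumes the same (argument index, check
kind, element type) triples that the emitters generate for SVE. A hand-written
illustration of what one entry encodes (field names are illustrative):

    struct NeonImmCheck {
      int ArgIdx;    /* which call argument holds the immediate */
      int CheckKind; /* an ArmImmCheckType value, e.g. ImmCheckShiftRight */
      int EltType;   /* NeonTypeFlags supplying the element width */
    };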
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index d8dd4fe16e3af..8f4d94e1df678 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -314,40 +314,6 @@ bool SemaARM::BuiltinARMSpecialReg(unsigned BuiltinID, CallExpr *TheCall,
   return false;
 }
 
-// Get the valid immediate range for the specified NEON type code.
-static unsigned RFT(unsigned t, bool shift = false, bool ForceQuad = false) {
-  NeonTypeFlags Type(t);
-  int IsQuad = ForceQuad ? true : Type.isQuad();
-  switch (Type.getEltType()) {
-  case NeonTypeFlags::Int8:
-  case NeonTypeFlags::Poly8:
-    return shift ? 7 : (8 << IsQuad) - 1;
-  case NeonTypeFlags::Int16:
-  case NeonTypeFlags::Poly16:
-    return shift ? 15 : (4 << IsQuad) - 1;
-  case NeonTypeFlags::Int32:
-    return shift ? 31 : (2 << IsQuad) - 1;
-  case NeonTypeFlags::Int64:
-  case NeonTypeFlags::Poly64:
-    return shift ? 63 : (1 << IsQuad) - 1;
-  case NeonTypeFlags::Poly128:
-    return shift ? 127 : (1 << IsQuad) - 1;
-  case NeonTypeFlags::Float16:
-    assert(!shift && "cannot shift float types!");
-    return (4 << IsQuad) - 1;
-  case NeonTypeFlags::Float32:
-    assert(!shift && "cannot shift float types!");
-    return (2 << IsQuad) - 1;
-  case NeonTypeFlags::Float64:
-    assert(!shift && "cannot shift float types!");
-    return (1 << IsQuad) - 1;
-  case NeonTypeFlags::BFloat16:
-    assert(!shift && "cannot shift float types!");
-    return (4 << IsQuad) - 1;
-  }
-  llvm_unreachable("Invalid NeonTypeFlag!");
-}
-
 /// getNeonEltType - Return the QualType corresponding to the elements of
 /// the vector type specified by the NeonTypeFlags.  This is used to check
 /// the pointer arguments for Neon load/store intrinsics.
@@ -403,6 +369,62 @@ enum ArmSMEState : unsigned {
   ArmZT0Mask = 0b11 << 2
 };
 
+bool SemaARM::ParseNeonImmChecks(
+    CallExpr *TheCall, SmallVector<std::tuple<int, int, int>, 2> &ImmChecks,
+    int OverloadType) {
+  int ArgIdx, CheckTy, ElementType;
+  bool HasError = false;
+
+  for (const auto &I : ImmChecks) {
+    std::tie(ArgIdx, CheckTy, ElementType) = I;
+
+    NeonTypeFlags Type = (OverloadType != -1) ? NeonTypeFlags(OverloadType)
+                                              : NeonTypeFlags(ElementType);
+
+    switch ((ArmImmCheckType)CheckTy) {
+    case ArmImmCheckType::ImmCheck0_3:
+      HasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0, 3);
+      break;
+    case ArmImmCheckType::ImmCheck0_63:
+      HasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0, 63);
+      break;
+    case ArmImmCheckType::ImmCheck0_7:
+      HasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0, 7);
+      break;
+    case ArmImmCheckType::ImmCheck1_16:
+      HasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 1, 16);
+      break;
+    case ArmImmCheckType::ImmCheck1_32:
+      HasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 1, 32);
+      break;
+    case ArmImmCheckType::ImmCheck1_64:
+      HasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 1, 64);
+      break;
+    case ArmImmCheckType::ImmCheckLaneIndex:
+      HasError |= SemaRef.BuiltinConstantArgRange(
+          TheCall, ArgIdx, 0,
+          (64 << Type.isQuad()) / Type.getEltSizeInBits() - 1);
+      break;
+    case ArmImmCheckType::ImmCheckLaneQIndex: // Force the type to be Quad.
+      HasError |= SemaRef.BuiltinConstantArgRange(
+          TheCall, ArgIdx, 0, (128 / Type.getEltSizeInBits()) - 1);
+      break;
+    case ArmImmCheckType::ImmCheckShiftLeft:
+      HasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0,
+                                                  Type.getEltSizeInBits() - 1);
+      break;
+    case ArmImmCheckType::ImmCheckShiftRight:
+      HasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 1,
+                                                  Type.getEltSizeInBits());
+      break;
+    default:
+      llvm_unreachable("Invalid Neon immediate range typeflag!");
+    }
+  }
+
+  return HasError;
+}
+
 bool SemaARM::ParseSVEImmChecks(
     CallExpr *TheCall, SmallVector<std::tuple<int, int, int>, 3> &ImmChecks) {
   // Perform all the immediate checks for this builtin call.
@@ -432,76 +454,76 @@ bool SemaARM::ParseSVEImmChecks(
       return false;
     };
 
-    switch ((SVETypeFlags::ImmCheckType)CheckTy) {
-    case SVETypeFlags::ImmCheck0_31:
+    switch ((ArmImmCheckType)CheckTy) {
+    case ArmImmCheckType::ImmCheck0_31:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 0, 31))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheck0_13:
+    case ArmImmCheckType::ImmCheck0_13:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 0, 13))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheck1_16:
+    case ArmImmCheckType::ImmCheck1_16:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 1, 16))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheck0_7:
+    case ArmImmCheckType::ImmCheck0_7:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 0, 7))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheck1_1:
+    case ArmImmCheckType::ImmCheck1_1:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 1, 1))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheck1_3:
+    case ArmImmCheckType::ImmCheck1_3:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 1, 3))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheck1_7:
+    case ArmImmCheckType::ImmCheck1_7:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 1, 7))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheckExtract:
+    case ArmImmCheckType::ImmCheckExtract:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 0,
                                           (2048 / ElementSizeInBits) - 1))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheckShiftRight:
+    case ArmImmCheckType::ImmCheckShiftRight:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 1,
                                           ElementSizeInBits))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheckShiftRightNarrow:
+    case ArmImmCheckType::ImmCheckShiftRightNarrow:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 1,
                                           ElementSizeInBits / 2))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheckShiftLeft:
+    case ArmImmCheckType::ImmCheckShiftLeft:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 0,
                                           ElementSizeInBits - 1))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheckLaneIndex:
+    case ArmImmCheckType::ImmCheckLaneIndex:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 0,
                                           (128 / (1 * ElementSizeInBits)) - 1))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheckLaneIndexCompRotate:
+    case ArmImmCheckType::ImmCheckLaneIndexCompRotate:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 0,
                                           (128 / (2 * ElementSizeInBits)) - 1))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheckLaneIndexDot:
+    case ArmImmCheckType::ImmCheckLaneIndexDot:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 0,
                                           (128 / (4 * ElementSizeInBits)) - 1))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheckComplexRot90_270:
+    case ArmImmCheckType::ImmCheckComplexRot90_270:
       if (CheckImmediateInSet([](int64_t V) { return V == 90 || V == 270; },
                               diag::err_rotation_argument_to_cadd))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheckComplexRotAll90:
+    case ArmImmCheckType::ImmCheckComplexRotAll90:
       if (CheckImmediateInSet(
               [](int64_t V) {
                 return V == 0 || V == 90 || V == 180 || V == 270;
@@ -509,35 +531,38 @@ bool SemaARM::ParseSVEImmChecks(
               diag::err_rotation_argument_to_cmla))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheck0_1:
+    case ArmImmCheckType::ImmCheck0_1:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 0, 1))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheck0_2:
+    case ArmImmCheckType::ImmCheck0_2:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 0, 2))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheck0_3:
+    case ArmImmCheckType::ImmCheck0_3:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 0, 3))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheck0_0:
+    case ArmImmCheckType::ImmCheck0_0:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 0, 0))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheck0_15:
+    case ArmImmCheckType::ImmCheck0_15:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 0, 15))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheck0_255:
+    case ArmImmCheckType::ImmCheck0_255:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 0, 255))
         HasError = true;
       break;
-    case SVETypeFlags::ImmCheck2_4_Mul2:
+    case ArmImmCheckType::ImmCheck2_4_Mul2:
       if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, 2, 4) ||
           SemaRef.BuiltinConstantArgMultiple(TheCall, ArgNum, 2))
         HasError = true;
       break;
+    default:
+      llvm_unreachable("Invalid Arm immediate range typeflag!");
     }
   }
 
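(For reviewers: every lane-index bound in the switch above falls out of the
fixed 128-bit vector width. A standalone sketch of that arithmetic --
illustrative only, not code from this patch:)

  // Max lane index = number of addressable groups in 128 bits, minus one.
  #include <cassert>

  static int maxLaneIndex(int EltSizeInBits) {
    return (128 / EltSizeInBits) - 1;         // ImmCheckLaneIndex
  }
  static int maxComplexLaneIndex(int EltSizeInBits) {
    return (128 / (2 * EltSizeInBits)) - 1;   // ImmCheckLaneIndexCompRotate
  }
  static int maxDotLaneIndex(int EltSizeInBits) {
    return (128 / (4 * EltSizeInBits)) - 1;   // ImmCheckLaneIndexDot
  }

  int main() {
    assert(maxLaneIndex(16) == 7);        // 8 half-precision lanes
    assert(maxComplexLaneIndex(16) == 3); // 4 (real, imag) f16 pairs
    assert(maxDotLaneIndex(8) == 3);      // 4 groups of 4 bytes
    return 0;
  }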
@@ -748,7 +773,7 @@ bool SemaARM::CheckNeonBuiltinFunctionCall(const TargetInfo &TI,
 
   llvm::APSInt Result;
   uint64_t mask = 0;
-  unsigned TV = 0;
+  int TV = -1;
   int PtrArgNum = -1;
   bool HasConstPtr = false;
   switch (BuiltinID) {
@@ -800,7 +825,7 @@ bool SemaARM::CheckNeonBuiltinFunctionCall(const TargetInfo &TI,
 
   // For NEON intrinsics which take an immediate value as part of the
   // instruction, range check them here.
-  unsigned i = 0, l = 0, u = 0;
+  SmallVector<std::tuple<int, int, int>, 2> ImmChecks;
   switch (BuiltinID) {
   default:
     return false;
@@ -808,9 +833,9 @@ bool SemaARM::CheckNeonBuiltinFunctionCall(const TargetInfo &TI,
 #include "clang/Basic/arm_fp16.inc"
 #include "clang/Basic/arm_neon.inc"
 #undef GET_NEON_IMMEDIATE_CHECK
-  }
-
-  return SemaRef.BuiltinConstantArgRange(TheCall, i, l, u + l);
+  }
+
+  return ParseNeonImmChecks(TheCall, ImmChecks, TV);
 }
 
 bool SemaARM::CheckMVEBuiltinFunctionCall(unsigned BuiltinID,
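(Only the call site of ParseNeonImmChecks appears in this hunk, so here is a
rough, self-contained model of the kind of dispatch it performs over the
generated (ArgIdx, CheckKind, EltType) tuples. The names and check kinds
below are illustrative, not the function's actual implementation:)

  // Toy model: CheckKind stands in for ArmImmCheckType, and the immediate
  // is assumed to be already constant-folded, unlike the real Sema path.
  #include <cstdint>
  #include <tuple>
  #include <vector>

  enum class CheckKind { Range0_1, Range0_3, LaneIndex };

  // Returns true on error, matching the HasError convention above.
  bool checkImm(int64_t Value, CheckKind Kind, int64_t EltSizeInBits) {
    switch (Kind) {
    case CheckKind::Range0_1:
      return Value < 0 || Value > 1;
    case CheckKind::Range0_3:
      return Value < 0 || Value > 3;
    case CheckKind::LaneIndex:
      return Value < 0 || Value > (128 / EltSizeInBits) - 1;
    }
    return true;
  }

  bool parseImmChecks(
      const std::vector<std::tuple<int64_t, CheckKind, int64_t>> &Checks) {
    bool HasError = false;
    for (const auto &[Value, Kind, EltSize] : Checks)
      HasError |= checkImm(Value, Kind, EltSize);
    return HasError;
  }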
diff --git a/clang/test/CodeGen/aarch64-neon-vcmla.c b/clang/test/CodeGen/aarch64-neon-vcmla.c
index 02171527cc6a3..2ff48fd97b427 100644
--- a/clang/test/CodeGen/aarch64-neon-vcmla.c
+++ b/clang/test/CodeGen/aarch64-neon-vcmla.c
@@ -155,15 +155,14 @@ float16x4_t test_vcmla_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rh
   return vcmla_lane_f16(acc, lhs, rhs, 1);
 }
 
-// ACLE says this exists, but it won't map to a single instruction if lane > 1.
 // CHECK-LABEL: @test_vcmla_laneq_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
-// CHECK: ret <4 x half> [[RES]]
+// CHECK:  %0 = bitcast <8 x half> %rhs to <4 x i32>
+// CHECK:  %1 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 poison, i32 1>
+// CHECK:  %2 = bitcast <2 x i32> %1 to <4 x half>
+// CHECK:  %3 = shufflevector <4 x half> %2, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK:  %vcmla_f163.i = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %3)
 float16x4_t test_vcmla_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
-  return vcmla_laneq_f16(acc, lhs, rhs, 3);
+  return vcmla_laneq_f16(acc, lhs, rhs, 1);
 }
 
 // CHECK-LABEL: @test_vcmlaq_lane_f16(
@@ -191,7 +190,6 @@ float32x2_t test_vcmla_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rh
   return vcmla_lane_f32(acc, lhs, rhs, 0);
 }
 
-// ACLE says this exists, but it won't map to a single instruction if lane > 1.
 // CHECK-LABEL: @test_vcmla_laneq_f32(
 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> poison, <1 x i32> <i32 1>
@@ -229,15 +227,14 @@ float16x4_t test_vcmla_rot90_lane_f16(float16x4_t acc, float16x4_t lhs, float16x
   return vcmla_rot90_lane_f16(acc, lhs, rhs, 1);
 }
 
-// ACLE says this exists, but it won't map to a single instruction if lane > 1.
-// CHECK-LABEL: @test_vcmla_rot90_laneq_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
-// CHECK: ret <4 x half> [[RES]]
+// CHECK: %0 = bitcast <8 x half> %rhs to <4 x i32>
+// CHECK: %1 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 poison>
+// CHECK: %2 = bitcast <2 x i32> %1 to <4 x half>
+// CHECK: %3 = shufflevector <4 x half> %2, <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK: %vcmla_rot90_f163.i = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %3)
+// CHECK: ret <4 x half> %vcmla_rot90_f163.i
 float16x4_t test_vcmla_rot90_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
-  return vcmla_rot90_laneq_f16(acc, lhs, rhs, 3);
+  return vcmla_rot90_laneq_f16(acc, lhs, rhs, 0);
 }
 
 // CHECK-LABEL: @test_vcmlaq_rot90_lane_f16(
@@ -265,7 +262,6 @@ float32x2_t test_vcmla_rot90_lane_f32(float32x2_t acc, float32x2_t lhs, float32x
   return vcmla_rot90_lane_f32(acc, lhs, rhs, 0);
 }
 
-// ACLE says this exists, but it won't map to a single instruction if lane > 1.
 // CHECK-LABEL: @test_vcmla_rot90_laneq_f32(
 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> poison, <1 x i32> <i32 1>
@@ -303,15 +299,15 @@ float16x4_t test_vcmla_rot180_lane_f16(float16x4_t acc, float16x4_t lhs, float16
   return vcmla_rot180_lane_f16(acc, lhs, rhs, 1);
 }
 
-// ACLE says this exists, but it won't map to a single instruction if lane > 1.
 // CHECK-LABEL: @test_vcmla_rot180_laneq_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
-// CHECK: ret <4 x half> [[RES]]
+// CHECK:  %0 = bitcast <8 x half> %rhs to <4 x i32>
+// CHECK:  %1 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 poison, i32 1>
+// CHECK:  %2 = bitcast <2 x i32> %1 to <4 x half>
+// CHECK:  %3 = shufflevector <4 x half> %2, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK:  %vcmla_rot180_f163.i = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %3)
+// CHECK:  ret <4 x half> %vcmla_rot180_f163.i
 float16x4_t test_vcmla_rot180_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
-  return vcmla_rot180_laneq_f16(acc, lhs, rhs, 3);
+  return vcmla_rot180_laneq_f16(acc, lhs, rhs, 1);
 }
 
 // CHECK-LABEL: @test_vcmlaq_rot180_lane_f16(
@@ -339,7 +335,6 @@ float32x2_t test_vcmla_rot180_lane_f32(float32x2_t acc, float32x2_t lhs, float32
   return vcmla_rot180_lane_f32(acc, lhs, rhs, 0);
 }
 
-// ACLE says this exists, but it won't map to a single instruction if lane > 1.
 // CHECK-LABEL: @test_vcmla_rot180_laneq_f32(
 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> poison, <1 x i32> <i32 1>
@@ -377,15 +372,15 @@ float16x4_t test_vcmla_rot270_lane_f16(float16x4_t acc, float16x4_t lhs, float16
   return vcmla_rot270_lane_f16(acc, lhs, rhs, 1);
 }
 
-// ACLE says this exists, but it won't map to a single instruction if lane > 1.
 // CHECK-LABEL: @test_vcmla_rot270_laneq_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
-// CHECK: ret <4 x half> [[RES]]
+// CHECK:  %0 = bitcast <8 x half> %rhs to <4 x i32>
+// CHECK:  %1 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 poison>
+// CHECK:  %2 = bitcast <2 x i32> %1 to <4 x half>
+// CHECK:  %3 = shufflevector <4 x half> %2, <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK:  %vcmla_rot270_f163.i = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %3)
+// CHECK:  ret <4 x half> %vcmla_rot270_f163.i
 float16x4_t test_vcmla_rot270_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
-  return vcmla_rot270_laneq_f16(acc, lhs, rhs, 3);
+  return vcmla_rot270_laneq_f16(acc, lhs, rhs, 0);
 }
 
 // CHECK-LABEL: @test_vcmlaq_rot270_lane_f16(
@@ -413,7 +408,6 @@ float32x2_t test_vcmla_rot270_lane_f32(float32x2_t acc, float32x2_t lhs, float32
   return vcmla_rot270_lane_f32(acc, lhs, rhs, 0);
 }
 
-// ACLE says this exists, but it won't map to a single instruction if lane > 1.
 // CHECK-LABEL: @test_vcmla_rot270_laneq_f32(
 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> poison, <1 x i32> <i32 1>
diff --git a/clang/test/Sema/aarch64-neon-vcmla-ranges.c b/clang/test/Sema/aarch64-neon-vcmla-ranges.c
new file mode 100644
index 0000000000000..9b42e68670da0
--- /dev/null
+++ b/clang/test/Sema/aarch64-neon-vcmla-ranges.c
@@ -0,0 +1,202 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +v8.3a -ffreestanding -fsyntax-only -verify %s
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+#include <arm_fp16.h>
+
+void test_vcmla_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c){
+  vcmla_lane_f16(a, b, c, 0);
+  vcmla_lane_f16(a, b, c, 1);
+
+  vcmla_lane_f16(a, b, c, 2); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmla_lane_f16(a, b, c, -1); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmla_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c){
+  vcmla_laneq_f16(a, b, c, 0);
+  vcmla_laneq_f16(a, b, c, 1);
+
+  vcmla_laneq_f16(a, b, c, -1); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmla_laneq_f16(a, b, c, 2);  // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmlaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c){
+  vcmlaq_lane_f16(a, b, c, 0);
+  vcmlaq_lane_f16(a, b, c, 1);
+
+  vcmlaq_lane_f16(a, b, c, -1); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmlaq_lane_f16(a, b, c, 2);  // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmlaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c){
+  vcmlaq_laneq_f16(a, b, c, 0);
+  vcmlaq_laneq_f16(a, b, c, 1);
+  vcmlaq_laneq_f16(a, b, c, 3);
+
+  vcmlaq_laneq_f16(a, b, c, -1); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmlaq_laneq_f16(a, b, c, 4);  // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c){
+  vcmla_lane_f32(a, b, c, 0);
+
+  vcmla_lane_f32(a, b, c, 1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+  vcmla_lane_f32(a, b, c, 2); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+  vcmla_lane_f32(a, b, c, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t c){
+  vcmla_laneq_f32(a, b, c, 0);
+
+  vcmla_laneq_f32(a, b, c, 2); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+  vcmla_laneq_f32(a, b, c, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c){
+  vcmlaq_laneq_f32(a, b, c, 0);
+  vcmlaq_laneq_f32(a, b, c, 1);
+
+  vcmlaq_laneq_f32(a, b, c, 2); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmlaq_laneq_f32(a, b, c, -1); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmla_rot90_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c){
+  vcmla_rot90_lane_f16(a, b, c, 0);
+  vcmla_rot90_lane_f16(a, b, c, 1);
+
+  vcmla_rot90_lane_f16(a, b, c, -1); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmla_rot90_lane_f16(a, b, c, 2); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmla_rot90_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c){
+  vcmla_rot90_laneq_f16(a, b, c, 0);
+  vcmla_rot90_laneq_f16(a, b, c, 1);
+
+  vcmla_rot90_laneq_f16(a, b, c, -1); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmla_rot90_laneq_f16(a, b, c, 2); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmlaq_rot90_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c){
+  vcmlaq_rot90_laneq_f16(a, b, c, 0);
+  vcmlaq_rot90_laneq_f16(a, b, c, 3);
+
+  vcmlaq_rot90_laneq_f16(a, b, c, -1); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmlaq_rot90_laneq_f16(a, b, c, 4); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmla_rot180_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c){
+  vcmla_rot180_lane_f16(a, b, c, 0);
+  vcmla_rot180_lane_f16(a, b, c, 1);
+
+  vcmla_rot180_lane_f16(a, b, c, -1); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmla_rot180_lane_f16(a, b, c, 2); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmla_rot180_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c){
+  vcmla_rot180_laneq_f16(a, b, c, 0);
+  vcmla_rot180_laneq_f16(a, b, c, 1);
+
+  vcmla_rot180_laneq_f16(a, b, c, -1); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmla_rot180_laneq_f16(a, b, c, 2); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmlaq_rot180_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c){
+  vcmlaq_rot180_laneq_f16(a, b, c, 0);
+  vcmlaq_rot180_laneq_f16(a, b, c, 3);
+
+  vcmlaq_rot180_laneq_f16(a, b, c, -1); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmlaq_rot180_laneq_f16(a, b, c, 4); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmla_rot270_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c){
+  vcmla_rot270_lane_f16(a, b, c, 0);
+  vcmla_rot270_lane_f16(a, b, c, 1);
+
+  vcmla_rot270_lane_f16(a, b, c, -1); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmla_rot270_lane_f16(a, b, c, 2); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmla_rot270_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c){
+  vcmla_rot270_laneq_f16(a, b, c, 0);
+  vcmla_rot270_laneq_f16(a, b, c, 1);
+
+  vcmla_rot270_laneq_f16(a, b, c, -1); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmla_rot270_laneq_f16(a, b, c, 2); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmlaq_rot270_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c){
+  vcmlaq_rot270_laneq_f16(a, b, c, 0);
+  vcmlaq_rot270_laneq_f16(a, b, c, 3);
+
+  vcmlaq_rot270_laneq_f16(a, b, c, -1); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmlaq_rot270_laneq_f16(a, b, c, 4); // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmla_rot90_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c){
+  vcmla_rot90_lane_f32(a, b, c, 0);
+
+  vcmla_rot90_lane_f32(a, b, c, 1);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
+  vcmla_rot90_lane_f32(a, b, c, -1);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmla_rot90_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t c){
+  vcmla_rot90_laneq_f32(a, b, c, 0);
+  vcmla_rot90_laneq_f32(a, b, c, 1);
+
+  vcmla_rot90_laneq_f32(a, b, c, 2);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
+  vcmla_rot90_laneq_f32(a, b, c, -1);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmlaq_rot90_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c){
+  vcmlaq_rot90_laneq_f32(a, b, c, 0);
+  vcmlaq_rot90_laneq_f32(a, b, c, 1);
+
+  vcmlaq_rot90_laneq_f32(a, b, c, 2);  // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmlaq_rot90_laneq_f32(a, b, c, -1);  // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmla_rot180_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c){
+  vcmla_rot180_lane_f32(a, b, c, 0);
+
+  vcmla_rot180_lane_f32(a, b, c, 1);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
+  vcmla_rot180_lane_f32(a, b, c, -1);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmla_rot180_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t c){
+  vcmla_rot180_laneq_f32(a, b, c, 0);
+  vcmla_rot180_laneq_f32(a, b, c, 1);
+
+  vcmla_rot180_laneq_f32(a, b, c, 2);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
+  vcmla_rot180_laneq_f32(a, b, c, -1);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmlaq_rot180_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c){
+  vcmlaq_rot180_laneq_f32(a, b, c, 0);
+  vcmlaq_rot180_laneq_f32(a, b, c, 1);
+
+  vcmlaq_rot180_laneq_f32(a, b, c, 2);  // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmlaq_rot180_laneq_f32(a, b, c, -1);  // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmla_rot270_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c){
+  vcmla_rot270_lane_f32(a, b, c, 0);
+
+  vcmla_rot270_lane_f32(a, b, c, 1);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
+  vcmla_rot270_lane_f32(a, b, c, -1);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmla_rot270_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t c){
+  vcmla_rot270_laneq_f32(a, b, c, 0);
+  vcmla_rot270_laneq_f32(a, b, c, 1);
+
+  vcmla_rot270_laneq_f32(a, b, c, 2);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
+  vcmla_rot270_laneq_f32(a, b, c, -1);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
+}
+
+void test_vcmlaq_rot270_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c){
+  vcmlaq_rot270_laneq_f32(a, b, c, 0);
+  vcmlaq_rot270_laneq_f32(a, b, c, 1);
+
+  vcmlaq_rot270_laneq_f32(a, b, c, 2);  // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+  vcmlaq_rot270_laneq_f32(a, b, c, -1);  // expected-error-re +{{argument value {{.*}} is outside the valid range}}
+}
\ No newline at end of file
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 30fbb8c5d65e5..7666b53000edc 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -333,6 +333,8 @@ class Intrinsic {
 
   /// The types of return value [0] and parameters [1..].
   std::vector<Type> Types;
+
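+  /// The (argument index, check kind, element-size argument index) tuples
+  /// describing the immediate-range checks for this intrinsic.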
+  SmallVector<std::tuple<int, int, int>, 2> ImmChecks;
   /// The index of the key type passed to CGBuiltin.cpp for polymorphic calls.
   int PolymorphicKeyType;
   /// The local variables defined.
@@ -368,9 +370,9 @@ class Intrinsic {
 
 public:
   Intrinsic(Record *R, StringRef Name, StringRef Proto, TypeSpec OutTS,
-            TypeSpec InTS, ClassKind CK, ListInit *Body, NeonEmitter &Emitter,
+            TypeSpec InTS, ArrayRef<std::tuple<int, int, int>> ImmChecks,
+            ClassKind CK, ListInit *Body, NeonEmitter &Emitter,
             StringRef ArchGuard, StringRef TargetGuard, bool IsUnavailable, bool BigEndianSafe)
-      : R(R), Name(Name.str()), OutTS(OutTS), InTS(InTS), CK(CK), Body(Body),
+      : R(R), Name(Name.str()), OutTS(OutTS), InTS(InTS),
+        ImmChecks(ImmChecks), CK(CK), Body(Body),
         ArchGuard(ArchGuard.str()), TargetGuard(TargetGuard.str()), IsUnavailable(IsUnavailable),
         BigEndianSafe(BigEndianSafe), PolymorphicKeyType(0), NeededEarly(false),
         UseMacro(false), BaseType(OutTS, "."), InBaseType(InTS, "."),
@@ -414,23 +416,22 @@ class Intrinsic {
   /// Get the architectural guard string (#ifdef).
   std::string getArchGuard() const { return ArchGuard; }
   std::string getTargetGuard() const { return TargetGuard; }
+  ArrayRef<std::tuple<int, int, int>> getImmChecks() const { return ImmChecks; }
   /// Get the non-mangled name.
   std::string getName() const { return Name; }
 
   /// Return true if the intrinsic takes an immediate operand.
   bool hasImmediate() const {
     return llvm::any_of(Types, [](const Type &T) { return T.isImmediate(); });
   }
 
-  /// Return the parameter index of the immediate operand.
-  unsigned getImmediateIdx() const {
-    for (unsigned Idx = 0; Idx < Types.size(); ++Idx)
-      if (Types[Idx].isImmediate())
-        return Idx - 1;
-    llvm_unreachable("Intrinsic has no immediate");
+  // Return true if the argument at index \p Idx is an immediate.
+  bool isArgImmediate(unsigned Idx) const {
+    assert((Idx + 1) < Types.size() && "Argument type index out of range!");
+    return Types[Idx + 1].isImmediate();
   }
 
-
   unsigned getNumParams() const { return Types.size() - 1; }
   Type getReturnType() const { return Types[0]; }
   Type getParamType(unsigned I) const { return Types[I + 1]; }
@@ -554,9 +555,9 @@ class NeonEmitter {
                                      SmallVectorImpl<Intrinsic *> &Defs);
   void genOverloadTypeCheckCode(raw_ostream &OS,
                                 SmallVectorImpl<Intrinsic *> &Defs);
+  void genNeonImmCheckTypes(raw_ostream &OS);
   void genIntrinsicRangeCheckCode(raw_ostream &OS,
                                   SmallVectorImpl<Intrinsic *> &Defs);
-
 public:
   /// Called by Intrinsic - this attempts to get an intrinsic that takes
   /// the given types as arguments.
@@ -1031,7 +1032,7 @@ std::string Intrinsic::getBuiltinTypeStr() {
     if (LocalCK == ClassI && T.isInteger())
       T.makeSigned();
 
-    if (hasImmediate() && getImmediateIdx() == I)
+    if (isArgImmediate(I))
       T.makeImmediate(32);
 
     S += T.builtin_str();
@@ -1952,6 +1953,16 @@ void NeonEmitter::createIntrinsic(Record *R,
   bool BigEndianSafe  = R->getValueAsBit("BigEndianSafe");
   std::string ArchGuard = std::string(R->getValueAsString("ArchGuard"));
   std::string TargetGuard = std::string(R->getValueAsString("TargetGuard"));
+  std::vector<Record *> ImmCheckList = R->getValueAsListOfDefs("ImmChecks");
+
+  SmallVector<std::tuple<int, int, int>, 2> ImmChecks;
+  // Use a distinct loop variable so the Record *R parameter is not shadowed.
+  for (const auto *ImmCheckRec : ImmCheckList)
+    ImmChecks.push_back(
+        std::make_tuple(ImmCheckRec->getValueAsInt("Arg"),
+                        ImmCheckRec->getValueAsDef("Kind")->getValueAsInt("Value"),
+                        ImmCheckRec->getValueAsInt("EltSizeArg")));
+
   bool IsUnavailable = OperationRec->getValueAsBit("Unavailable");
   std::string CartesianProductWith = std::string(R->getValueAsString("CartesianProductWith"));
 
@@ -1992,7 +2003,7 @@ void NeonEmitter::createIntrinsic(Record *R,
   auto &Entry = IntrinsicMap[Name];
 
   for (auto &I : NewTypeSpecs) {
-    Entry.emplace_back(R, Name, Proto, I.first, I.second, CK, Body, *this,
+    Entry.emplace_back(R, Name, Proto, I.first, I.second, ImmChecks, CK, Body, *this,
                        ArchGuard, TargetGuard, IsUnavailable, BigEndianSafe);
     Out.push_back(&Entry.back());
   }
@@ -2142,84 +2153,40 @@ void NeonEmitter::genOverloadTypeCheckCode(raw_ostream &OS,
   OS << "#endif\n\n";
 }
 
-void NeonEmitter::genIntrinsicRangeCheckCode(raw_ostream &OS,
-                                        SmallVectorImpl<Intrinsic *> &Defs) {
-  OS << "#ifdef GET_NEON_IMMEDIATE_CHECK\n";
+void NeonEmitter::genNeonImmCheckTypes(raw_ostream &OS) {
+  OS << "#ifdef GET_NEON_IMMCHECKTYPES\n";
+
+  for (auto *RV : Records.getAllDerivedDefinitions("ImmCheckType"))
+    OS << "  " << RV->getNameInitAsString() << " = "
+       << RV->getValueAsInt("Value") << ",\n";
 
+  OS << "#endif\n\n";
+}
+
+void NeonEmitter::genIntrinsicRangeCheckCode(raw_ostream &OS,
+                                             SmallVectorImpl<Intrinsic *> &Defs) {
+  OS << "#ifdef GET_NEON_IMMEDIATE_CHECK\n";
+  // Ensure that each builtin is emitted only once.
   std::set<std::string> Emitted;
 
-  for (auto *Def : Defs) {
-    if (Def->hasBody())
-      continue;
-    // Functions which do not have an immediate do not need to have range
-    // checking code emitted.
-    if (!Def->hasImmediate())
-      continue;
-    if (Emitted.find(Def->getMangledName()) != Emitted.end())
+  for (auto &Def : Defs) {
+    if (Emitted.find(Def->getMangledName()) != Emitted.end() ||
+        !Def->hasImmediate())
       continue;
 
-    std::string LowerBound, UpperBound;
-
-    Record *R = Def->getRecord();
-    if (R->getValueAsBit("isVXAR")) {
-      //VXAR takes an immediate in the range [0, 63]
-      LowerBound = "0";
-      UpperBound = "63";
-    } else if (R->getValueAsBit("isVCVT_N")) {
-      // VCVT between floating- and fixed-point values takes an immediate
-      // in the range [1, 32) for f32 or [1, 64) for f64 or [1, 16) for f16.
-      LowerBound = "1";
-	  if (Def->getBaseType().getElementSizeInBits() == 16 ||
-		  Def->getName().find('h') != std::string::npos)
-		// VCVTh operating on FP16 intrinsics in range [1, 16)
-		UpperBound = "15";
-	  else if (Def->getBaseType().getElementSizeInBits() == 32)
-        UpperBound = "31";
-	  else
-        UpperBound = "63";
-    } else if (R->getValueAsBit("isScalarShift")) {
-      // Right shifts have an 'r' in the name, left shifts do not. Convert
-      // instructions have the same bounds and right shifts.
-      if (Def->getName().find('r') != std::string::npos ||
-          Def->getName().find("cvt") != std::string::npos)
-        LowerBound = "1";
-
-      UpperBound = utostr(Def->getReturnType().getElementSizeInBits() - 1);
-    } else if (R->getValueAsBit("isShift")) {
-      // Builtins which are overloaded by type will need to have their upper
-      // bound computed at Sema time based on the type constant.
-
-      // Right shifts have an 'r' in the name, left shifts do not.
-      if (Def->getName().find('r') != std::string::npos)
-        LowerBound = "1";
-      UpperBound = "RFT(TV, true)";
-    } else if (Def->getClassKind(true) == ClassB) {
-      // ClassB intrinsics have a type (and hence lane number) that is only
-      // known at runtime.
-      if (R->getValueAsBit("isLaneQ"))
-        UpperBound = "RFT(TV, false, true)";
-      else
-        UpperBound = "RFT(TV, false, false)";
-    } else {
-      // The immediate generally refers to a lane in the preceding argument.
-      assert(Def->getImmediateIdx() > 0);
-      Type T = Def->getParamType(Def->getImmediateIdx() - 1);
-      UpperBound = utostr(T.getNumElements() - 1);
-    }
+    // If the Def has a body (an operation DAG), it does not correspond to a
+    // __builtin_neon_* and needs no range check.
+    if (Def->hasBody())
+      continue;
 
-    // Calculate the index of the immediate that should be range checked.
-    unsigned Idx = Def->getNumParams();
-    if (Def->hasImmediate())
-      Idx = Def->getGeneratedParamIdx(Def->getImmediateIdx());
-
-    OS << "case NEON::BI__builtin_neon_" << Def->getMangledName() << ": "
-       << "i = " << Idx << ";";
-    if (!LowerBound.empty())
-      OS << " l = " << LowerBound << ";";
-    if (!UpperBound.empty())
-      OS << " u = " << UpperBound << ";";
-    OS << " break;\n";
+    OS << "case NEON::BI__builtin_neon_" << Def->getMangledName() << ":\n";
+    
+    for(const auto &Check: Def->getImmChecks()){
+      EltType = std::get<2>(Check);   // elt type argument
+      if(EltType >= 0)
+        EltType = Def->getParamType(EltType).getNeonEnum();
 
+      OS << "  ImmChecks.push_back(std::make_tuple(" << std::get<0>(Check) << 
+                ", " << std::get<1>(Check) <<  ", " << EltType << ")); \n";
+      OS << "  break;\n";
+    }
     Emitted.insert(Def->getMangledName());
   }
 
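(With the emission loop above, the regenerated arm_neon.inc switch body
consumed under GET_NEON_IMMEDIATE_CHECK takes roughly the following shape;
the builtin name and tuple values here are illustrative, not copied from the
generated file:)

  case NEON::BI__builtin_neon_vcmla_lane_f16:
    // (immediate arg index, ArmImmCheckType value, element-type enum)
    ImmChecks.push_back(std::make_tuple(3, ImmCheckLaneIndexCompRotate, 2));
    break;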
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index caedd5978a87c..027aa4b4c6bb2 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -1573,7 +1573,7 @@ void SVEEmitter::createTypeFlags(raw_ostream &OS) {
     OS << "  " << KV.getKey() << " = " << KV.getValue() << ",\n";
   OS << "#endif\n\n";
 
-  OS << "#ifdef LLVM_GET_SVE_IMMCHECKTYPES\n";
+  OS << "#ifdef LLVM_GET_ARM_INTRIN_IMMCHECKTYPES\n";
   for (auto &KV : ImmCheckTypes)
     OS << "  " << KV.getKey() << " = " << KV.getValue() << ",\n";
   OS << "#endif\n\n";

>From 1d9084e9c246d2fcb395c329c6cf1ca19ef032aa Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Tue, 23 Jul 2024 22:28:48 +0000
Subject: [PATCH 2/3] Updated/consistent vcmla codegen tests

---
 clang/test/CodeGen/aarch64-neon-vcmla.c | 610 +++++++++++++++---------
 1 file changed, 384 insertions(+), 226 deletions(-)

diff --git a/clang/test/CodeGen/aarch64-neon-vcmla.c b/clang/test/CodeGen/aarch64-neon-vcmla.c
index 2ff48fd97b427..d82d74d019c01 100644
--- a/clang/test/CodeGen/aarch64-neon-vcmla.c
+++ b/clang/test/CodeGen/aarch64-neon-vcmla.c
@@ -1,438 +1,596 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -target-feature +neon \
-// RUN:        -target-feature +v8.3a \
-// RUN:        -target-feature +fullfp16 \
-// RUN:        -disable-O0-optnone -emit-llvm -o - %s | opt -S -O1 | FileCheck %s
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple arm64-linux-gnu -target-feature +neon  \
+// RUN:            -target-feature +v8.3a -target-feature +fullfp16 \
+// RUN:            -disable-O0-optnone -emit-llvm -o - %s | opt -S -O1 | FileCheck %s
 
 // REQUIRES: aarch64-registered-target
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: @test_vcmla_f16(
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_F163_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[RHS]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_F163_I]]
+//
 float16x4_t test_vcmla_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_f16(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmla_f32(
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_F323_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[RHS]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_F323_I]]
+//
 float32x2_t test_vcmla_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_f32(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_f16(
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_F163_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[RHS]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_F163_I]]
+//
 float16x8_t test_vcmlaq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_f16(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_f32(
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_F323_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[RHS]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_F323_I]]
+//
 float32x4_t test_vcmlaq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_f32(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_f64(
-// CHECK: [[RES:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot0.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
-// CHECK: ret <2 x double> [[RES]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcmlaq_f64(
+// CHECK-SAME: <2 x double> noundef [[ACC:%.*]], <2 x double> noundef [[LHS:%.*]], <2 x double> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_F643_I:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot0.v2f64(<2 x double> [[ACC]], <2 x double> [[LHS]], <2 x double> [[RHS]])
+// CHECK-NEXT:    ret <2 x double> [[VCMLAQ_F643_I]]
+//
 float64x2_t test_vcmlaq_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
   return vcmlaq_f64(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmla_rot90_f16(
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot90_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_ROT90_F163_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[RHS]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT90_F163_I]]
+//
 float16x4_t test_vcmla_rot90_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_rot90_f16(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmla_rot90_f32(
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot90_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_ROT90_F323_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[RHS]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT90_F323_I]]
+//
 float32x2_t test_vcmla_rot90_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_rot90_f32(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot90_f16(
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot90_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT90_F163_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[RHS]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT90_F163_I]]
+//
 float16x8_t test_vcmlaq_rot90_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_rot90_f16(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot90_f32(
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot90_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT90_F323_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[RHS]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT90_F323_I]]
+//
 float32x4_t test_vcmlaq_rot90_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_rot90_f32(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot90_f64(
-// CHECK: [[RES:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot90.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
-// CHECK: ret <2 x double> [[RES]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcmlaq_rot90_f64(
+// CHECK-SAME: <2 x double> noundef [[ACC:%.*]], <2 x double> noundef [[LHS:%.*]], <2 x double> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT90_F643_I:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot90.v2f64(<2 x double> [[ACC]], <2 x double> [[LHS]], <2 x double> [[RHS]])
+// CHECK-NEXT:    ret <2 x double> [[VCMLAQ_ROT90_F643_I]]
+//
 float64x2_t test_vcmlaq_rot90_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
   return vcmlaq_rot90_f64(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmla_rot180_f16(
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot180_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_ROT180_F163_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[RHS]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT180_F163_I]]
+//
 float16x4_t test_vcmla_rot180_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_rot180_f16(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmla_rot180_f32(
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot180_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_ROT180_F323_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[RHS]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT180_F323_I]]
+//
 float32x2_t test_vcmla_rot180_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_rot180_f32(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot180_f16(
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot180_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT180_F163_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[RHS]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT180_F163_I]]
+//
 float16x8_t test_vcmlaq_rot180_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_rot180_f16(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot180_f32(
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot180_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT180_F323_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[RHS]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT180_F323_I]]
+//
 float32x4_t test_vcmlaq_rot180_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_rot180_f32(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot180_f64(
-// CHECK: [[RES:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot180.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
-// CHECK: ret <2 x double> [[RES]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcmlaq_rot180_f64(
+// CHECK-SAME: <2 x double> noundef [[ACC:%.*]], <2 x double> noundef [[LHS:%.*]], <2 x double> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT180_F643_I:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot180.v2f64(<2 x double> [[ACC]], <2 x double> [[LHS]], <2 x double> [[RHS]])
+// CHECK-NEXT:    ret <2 x double> [[VCMLAQ_ROT180_F643_I]]
+//
 float64x2_t test_vcmlaq_rot180_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
   return vcmlaq_rot180_f64(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmla_rot270_f16(
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot270_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_ROT270_F163_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[RHS]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT270_F163_I]]
+//
 float16x4_t test_vcmla_rot270_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_rot270_f16(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmla_rot270_f32(
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot270_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_ROT270_F323_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[RHS]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT270_F323_I]]
+//
 float32x2_t test_vcmla_rot270_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_rot270_f32(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot270_f16(
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot270_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT270_F163_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[RHS]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT270_F163_I]]
+//
 float16x8_t test_vcmlaq_rot270_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_rot270_f16(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot270_f32(
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot270_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT270_F323_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[RHS]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT270_F323_I]]
+//
 float32x4_t test_vcmlaq_rot270_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_rot270_f32(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot270_f64(
-// CHECK: [[RES:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
-// CHECK: ret <2 x double> [[RES]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcmlaq_rot270_f64(
+// CHECK-SAME: <2 x double> noundef [[ACC:%.*]], <2 x double> noundef [[LHS:%.*]], <2 x double> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT270_F643_I:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> [[ACC]], <2 x double> [[LHS]], <2 x double> [[RHS]])
+// CHECK-NEXT:    ret <2 x double> [[VCMLAQ_ROT270_F643_I]]
+//
 float64x2_t test_vcmlaq_rot270_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
   return vcmlaq_rot270_f64(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmla_lane_f16(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_lane_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[RHS]], <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK-NEXT:    [[VCMLA_F163_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP0]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_F163_I]]
+//
 float16x4_t test_vcmla_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_lane_f16(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmla_laneq_f16(
-// CHECK:  %0 = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK:  %1 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 poison, i32 1>
-// CHECK:  %2 = bitcast <2 x i32> %1 to <4 x half>
-// CHECK:  %3 = shufflevector <4 x half> %2, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK:  %vcmla_f163.i = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %3)
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_laneq_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[RHS]] to <4 x i32>
+// CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 poison, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x half>
+// CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK-NEXT:    [[VCMLA_F163_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP3]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_F163_I]]
+//
 float16x4_t test_vcmla_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
   return vcmla_laneq_f16(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_lane_f16(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> 
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP]])
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_lane_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[RHS]], <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+// CHECK-NEXT:    [[VCMLAQ_F163_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP0]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_F163_I]]
+//
 float16x8_t test_vcmlaq_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
   return vcmlaq_lane_f16(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_laneq_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_laneq_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[RHS]] to <4 x i32>
+// CHECK-NEXT:    [[VECINIT15:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT15]] to <8 x half>
+// CHECK-NEXT:    [[VCMLAQ_F163_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP1]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_F163_I]]
+//
 float16x8_t test_vcmlaq_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_laneq_f16(acc, lhs, rhs, 3);
 }
 
-// CHECK-LABEL: @test_vcmla_lane_f32(
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_F323_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[RHS]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_F323_I]]
+//
 float32x2_t test_vcmla_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_lane_f32(acc, lhs, rhs, 0);
 }
 
-// CHECK-LABEL: @test_vcmla_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> poison, <1 x i32> <i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_laneq_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[RHS]] to <2 x i64>
+// CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <1 x i32> <i32 1>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[VECINIT]] to <2 x float>
+// CHECK-NEXT:    [[VCMLA_F323_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_F323_I]]
+//
 float32x2_t test_vcmla_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
   return vcmla_laneq_f32(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_lane_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
-// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> poison, i64 [[CPLX]], i64 0
-// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[RHS]] to i64
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[VECINIT]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK-NEXT:    [[VCMLAQ_F323_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_F323_I]]
+//
 float32x4_t test_vcmlaq_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
   return vcmlaq_lane_f32(acc, lhs, rhs, 0);
 }
 
-// CHECK-LABEL: @test_vcmlaq_laneq_f32(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_laneq_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> [[RHS]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK-NEXT:    [[VCMLAQ_F323_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP0]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_F323_I]]
+//
 float32x4_t test_vcmlaq_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_laneq_f32(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmla_rot90_lane_f16(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot90_lane_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[RHS]], <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK-NEXT:    [[VCMLA_ROT90_F163_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP0]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT90_F163_I]]
+//
 float16x4_t test_vcmla_rot90_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_rot90_lane_f16(acc, lhs, rhs, 1);
 }
 
-// CHECK: %0 = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: %1 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 poison>
-// CHECK: %2 = bitcast <2 x i32> %1 to <4 x half>
-// CHECK: %3 = shufflevector <4 x half> %2, <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-// CHECK: %vcmla_rot90_f163.i = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %3)
-// CHECK: ret <4 x half> %vcmla_rot90_f163.i
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot90_laneq_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[RHS]] to <4 x i32>
+// CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 0, i32 poison>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x half>
+// CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK-NEXT:    [[VCMLA_ROT90_F163_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP3]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT90_F163_I]]
+//
 float16x4_t test_vcmla_rot90_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
   return vcmla_rot90_laneq_f16(acc, lhs, rhs, 0);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot90_lane_f16(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP]])
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot90_lane_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[RHS]], <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+// CHECK-NEXT:    [[VCMLAQ_ROT90_F163_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP0]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT90_F163_I]]
+//
 float16x8_t test_vcmlaq_rot90_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
   return vcmlaq_rot90_lane_f16(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot90_laneq_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot90_laneq_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[RHS]] to <4 x i32>
+// CHECK-NEXT:    [[VECINIT15:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT15]] to <8 x half>
+// CHECK-NEXT:    [[VCMLAQ_ROT90_F163_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP1]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT90_F163_I]]
+//
 float16x8_t test_vcmlaq_rot90_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_rot90_laneq_f16(acc, lhs, rhs, 3);
 }
 
-// CHECK-LABEL: @test_vcmla_rot90_lane_f32(
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot90_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_ROT90_F323_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[RHS]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT90_F323_I]]
+//
 float32x2_t test_vcmla_rot90_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_rot90_lane_f32(acc, lhs, rhs, 0);
 }
 
-// CHECK-LABEL: @test_vcmla_rot90_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> poison, <1 x i32> <i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot90_laneq_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[RHS]] to <2 x i64>
+// CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <1 x i32> <i32 1>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[VECINIT]] to <2 x float>
+// CHECK-NEXT:    [[VCMLA_ROT90_F323_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT90_F323_I]]
+//
 float32x2_t test_vcmla_rot90_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
   return vcmla_rot90_laneq_f32(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot90_lane_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
-// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> poison, i64 [[CPLX]], i64 0
-// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot90_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[RHS]] to i64
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[VECINIT]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK-NEXT:    [[VCMLAQ_ROT90_F323_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT90_F323_I]]
+//
 float32x4_t test_vcmlaq_rot90_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
   return vcmlaq_rot90_lane_f32(acc, lhs, rhs, 0);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot90_laneq_f32(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot90_laneq_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> [[RHS]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK-NEXT:    [[VCMLAQ_ROT90_F323_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP0]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT90_F323_I]]
+//
 float32x4_t test_vcmlaq_rot90_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_rot90_laneq_f32(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmla_rot180_lane_f16(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot180_lane_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[RHS]], <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK-NEXT:    [[VCMLA_ROT180_F163_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP0]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT180_F163_I]]
+//
 float16x4_t test_vcmla_rot180_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_rot180_lane_f16(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmla_rot180_laneq_f16(
-// CHECK:  %0 = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK:  %1 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 poison, i32 1>
-// CHECK:  %2 = bitcast <2 x i32> %1 to <4 x half>
-// CHECK:  %3 = shufflevector <4 x half> %2, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK:  %vcmla_rot180_f163.i = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %3)
-// CHECK:  ret <4 x half> %vcmla_rot180_f163.i
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot180_laneq_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[RHS]] to <4 x i32>
+// CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 poison, i32 1>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x half>
+// CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK-NEXT:    [[VCMLA_ROT180_F163_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP3]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT180_F163_I]]
+//
 float16x4_t test_vcmla_rot180_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
   return vcmla_rot180_laneq_f16(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot180_lane_f16(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP]])
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot180_lane_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[RHS]], <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+// CHECK-NEXT:    [[VCMLAQ_ROT180_F163_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP0]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT180_F163_I]]
+//
 float16x8_t test_vcmlaq_rot180_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
   return vcmlaq_rot180_lane_f16(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot180_laneq_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot180_laneq_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[RHS]] to <4 x i32>
+// CHECK-NEXT:    [[VECINIT15:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT15]] to <8 x half>
+// CHECK-NEXT:    [[VCMLAQ_ROT180_F163_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP1]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT180_F163_I]]
+//
 float16x8_t test_vcmlaq_rot180_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_rot180_laneq_f16(acc, lhs, rhs, 3);
 }
 
-// CHECK-LABEL: @test_vcmla_rot180_lane_f32(
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot180_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_ROT180_F323_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[RHS]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT180_F323_I]]
+//
 float32x2_t test_vcmla_rot180_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_rot180_lane_f32(acc, lhs, rhs, 0);
 }
 
-// CHECK-LABEL: @test_vcmla_rot180_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> poison, <1 x i32> <i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot180_laneq_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[RHS]] to <2 x i64>
+// CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <1 x i32> <i32 1>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[VECINIT]] to <2 x float>
+// CHECK-NEXT:    [[VCMLA_ROT180_F323_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT180_F323_I]]
+//
 float32x2_t test_vcmla_rot180_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
   return vcmla_rot180_laneq_f32(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot180_lane_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
-// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> poison, i64 [[CPLX]], i64 0
-// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot180_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[RHS]] to i64
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[VECINIT]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK-NEXT:    [[VCMLAQ_ROT180_F323_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT180_F323_I]]
+//
 float32x4_t test_vcmlaq_rot180_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
   return vcmlaq_rot180_lane_f32(acc, lhs, rhs, 0);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot180_laneq_f32(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot180_laneq_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> [[RHS]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK-NEXT:    [[VCMLAQ_ROT180_F323_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP0]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT180_F323_I]]
+//
 float32x4_t test_vcmlaq_rot180_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_rot180_laneq_f32(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmla_rot270_lane_f16(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot270_lane_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[RHS]], <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK-NEXT:    [[VCMLA_ROT270_F163_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP0]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT270_F163_I]]
+//
 float16x4_t test_vcmla_rot270_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_rot270_lane_f16(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmla_rot270_laneq_f16(
-// CHECK:  %0 = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK:  %1 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 poison>
-// CHECK:  %2 = bitcast <2 x i32> %1 to <4 x half>
-// CHECK:  %3 = shufflevector <4 x half> %2, <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-// CHECK:  %vcmla_rot270_f163.i = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %3)
-// CHECK:  ret <4 x half> %vcmla_rot270_f163.
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot270_laneq_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[RHS]] to <4 x i32>
+// CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 0, i32 poison>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x half>
+// CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK-NEXT:    [[VCMLA_ROT270_F163_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP3]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT270_F163_I]]
+//
 float16x4_t test_vcmla_rot270_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
   return vcmla_rot270_laneq_f16(acc, lhs, rhs, 0);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot270_lane_f16(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP]])
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot270_lane_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[RHS]], <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+// CHECK-NEXT:    [[VCMLAQ_ROT270_F163_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP0]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT270_F163_I]]
+//
 float16x8_t test_vcmlaq_rot270_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
   return vcmlaq_rot270_lane_f16(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot270_laneq_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot270_laneq_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[RHS]] to <4 x i32>
+// CHECK-NEXT:    [[VECINIT15:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT15]] to <8 x half>
+// CHECK-NEXT:    [[VCMLAQ_ROT270_F163_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP1]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT270_F163_I]]
+//
 float16x8_t test_vcmlaq_rot270_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_rot270_laneq_f16(acc, lhs, rhs, 3);
 }
 
-// CHECK-LABEL: @test_vcmla_rot270_lane_f32(
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot270_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_ROT270_F323_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[RHS]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT270_F323_I]]
+//
 float32x2_t test_vcmla_rot270_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_rot270_lane_f32(acc, lhs, rhs, 0);
 }
 
-// CHECK-LABEL: @test_vcmla_rot270_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> poison, <1 x i32> <i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot270_laneq_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[RHS]] to <2 x i64>
+// CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <1 x i32> <i32 1>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[VECINIT]] to <2 x float>
+// CHECK-NEXT:    [[VCMLA_ROT270_F323_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT270_F323_I]]
+//
 float32x2_t test_vcmla_rot270_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
   return vcmla_rot270_laneq_f32(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot270_lane_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
-// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> poison, i64 [[CPLX]], i64 0
-// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot270_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[RHS]] to i64
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[VECINIT]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// CHECK-NEXT:    [[VCMLAQ_ROT270_F323_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT270_F323_I]]
+//
 float32x4_t test_vcmlaq_rot270_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
   return vcmlaq_rot270_lane_f32(acc, lhs, rhs, 0);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot270_laneq_f32(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot270_laneq_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> [[RHS]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+// CHECK-NEXT:    [[VCMLAQ_ROT270_F323_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP0]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT270_F323_I]]
+//
 float32x4_t test_vcmlaq_rot270_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_rot270_laneq_f32(acc, lhs, rhs, 1);
 }

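A note on the pattern in the regenerated checks above: for every laneq form, the rhs operand is first reinterpreted so that a single integer element carries one (real, imag) complex pair, the pair named by the lane immediate is selected, and that pair is broadcast across the result. In test_vcmlaq_rot90_laneq_f16, for instance, the <8 x half> operand becomes <4 x i32>, element 3 is splatted, and the result is bitcast back to <8 x half>. The following self-contained C sketch (illustrative only; the struct and helper names are invented here and are not part of the patch) mirrors that reinterpret-and-duplicate step:

    #include <stdint.h>
    #include <string.h>

    /* Stand-in for the bit pattern of an <8 x half> vector. */
    typedef struct { uint16_t h[8]; } f16x8_bits;

    /* Duplicate complex pair `lane` (0..3) across all four pair slots,
       mirroring the bitcast-to-<4 x i32> + shufflevector broadcast in
       the IR checked above. */
    static f16x8_bits dup_complex_pair(f16x8_bits x, int lane) {
      uint32_t pairs[4];
      memcpy(pairs, x.h, sizeof pairs);   /* bitcast <8 x half> to <4 x i32> */
      uint32_t p = pairs[lane];           /* select one (real, imag) pair    */
      uint32_t bcast[4] = { p, p, p, p }; /* broadcast the pair              */
      f16x8_bits r;
      memcpy(r.h, bcast, sizeof r.h);
      return r;
    }
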
>From 53216bc2003e03dfc69732f0a9f2753687c4a6ae Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Wed, 24 Jul 2024 11:33:27 +0000
Subject: [PATCH 3/3] Run clang-format

---
 clang/include/clang/Basic/TargetBuiltins.h | 51 ++++++------
 clang/include/clang/Sema/SemaARM.h         |  4 +-
 clang/lib/Sema/SemaARM.cpp                 | 93 +++++++++++-----------
 clang/utils/TableGen/NeonEmitter.cpp       | 54 ++++++++-----
 4 files changed, 106 insertions(+), 96 deletions(-)

diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
index 50e17ad7e1628..384811f9281ac 100644
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -209,7 +209,7 @@ namespace clang {
         Flags |= QuadFlag;
     }
 
-    EltType getEltType() const { return (EltType)(Flags & EltTypeMask); } 
+    EltType getEltType() const { return (EltType)(Flags & EltTypeMask); }
     bool isPoly() const {
       EltType ET = getEltType();
       return ET == Poly8 || ET == Poly16 || ET == Poly64;
@@ -217,36 +217,36 @@ namespace clang {
     bool isUnsigned() const { return (Flags & UnsignedFlag) != 0; }
     bool isQuad() const { return (Flags & QuadFlag) != 0; };
     unsigned getEltSizeInBits() const {
-      switch(getEltType()){
-        case Int8:
-        case Poly8:
-          return 8;
-        case Int16:
-        case Float16:
-        case Poly16:
-        case BFloat16:
-          return 16;
-        case Int32:
-        case Float32:
-          return 32;
-        case Int64:
-        case Float64:
-        case Poly64:
-          return 64;
-        case Poly128:
-          return 128;
-        default:
-          llvm_unreachable("Invalid NeonTypeFlag!");
+      switch (getEltType()) {
+      case Int8:
+      case Poly8:
+        return 8;
+      case Int16:
+      case Float16:
+      case Poly16:
+      case BFloat16:
+        return 16;
+      case Int32:
+      case Float32:
+        return 32;
+      case Int64:
+      case Float64:
+      case Poly64:
+        return 64;
+      case Poly128:
+        return 128;
+      default:
+        llvm_unreachable("Invalid NeonTypeFlag!");
       }
     }
   };
 
-    // Shared between SVE/SME and NEON
-    enum ArmImmCheckType {
+  // Shared between SVE/SME and NEON
+  enum ArmImmCheckType {
 #define LLVM_GET_ARM_INTRIN_IMMCHECKTYPES
 #include "clang/Basic/arm_sve_typeflags.inc"
-#undef  LLVM_GET_ARM_INTRIN_IMMCHECKTYPES
-    };
+#undef LLVM_GET_ARM_INTRIN_IMMCHECKTYPES
+  };
 
   /// Flags to identify the types for overloaded SVE builtins.
   class SVETypeFlags {
@@ -279,7 +279,6 @@ namespace clang {
 #undef LLVM_GET_SVE_MERGETYPES
     };
 
-
     SVETypeFlags(uint64_t F) : Flags(F) {
       EltTypeShift = llvm::countr_zero(EltTypeMask);
       MemEltTypeShift = llvm::countr_zero(MemEltTypeMask);
diff --git a/clang/include/clang/Sema/SemaARM.h b/clang/include/clang/Sema/SemaARM.h
index 1ced84300c179..2f13e60f081c5 100644
--- a/clang/include/clang/Sema/SemaARM.h
+++ b/clang/include/clang/Sema/SemaARM.h
@@ -41,8 +41,8 @@ class SemaARM : public SemaBase {
                                     unsigned MaxWidth);
   bool CheckNeonBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
                                     CallExpr *TheCall);
-  bool ParseNeonImmChecks(CallExpr *TheCall, 
-                          SmallVector<std::tuple<int, int, int>, 2> &ImmChecks, 
+  bool ParseNeonImmChecks(CallExpr *TheCall,
+                          SmallVector<std::tuple<int, int, int>, 2> &ImmChecks,
                           int OverloadType);
   bool CheckMVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
   bool CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 8f4d94e1df678..bbcca1c72465a 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -369,62 +369,63 @@ enum ArmSMEState : unsigned {
   ArmZT0Mask = 0b11 << 2
 };
 
-bool SemaARM::ParseNeonImmChecks(CallExpr *TheCall, SmallVector<std::tuple<int, int, int>, 2> 
-                                &ImmChecks, int OverloadType = -1) {
+bool SemaARM::ParseNeonImmChecks(
+    CallExpr *TheCall, SmallVector<std::tuple<int, int, int>, 2> &ImmChecks,
+    int OverloadType = -1) {
   int ArgIdx, CheckTy, ElementType;
   bool hasError = false;
 
   for (auto &I : ImmChecks) {
     std::tie(ArgIdx, CheckTy, ElementType) = I;
 
-    NeonTypeFlags Type = (OverloadType != -1) ? 
-                          NeonTypeFlags(OverloadType) : NeonTypeFlags(ElementType); 
-          
-    switch((ArmImmCheckType)CheckTy) {
-      case ArmImmCheckType::ImmCheck0_3:
-        hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0, 3);
-        break;
-      case ArmImmCheckType::ImmCheck0_63:
-        hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0, 63);
-        break;
-      case ArmImmCheckType::ImmCheck0_7:
-        hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0, 7);
-        break;
-      case ArmImmCheckType::ImmCheck1_16:
-        hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 1, 16);
-        break;
-      case ArmImmCheckType::ImmCheck1_32:
-        hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 1, 32);
-        break;
-      case ArmImmCheckType::ImmCheck1_64:
-        hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 1, 64);
-        break;
-      case ArmImmCheckType::ImmCheckLaneIndex:
-        hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0,  (64 << Type.isQuad()) / 
-                                                    Type.getEltSizeInBits() - 1);
-        break; 
-      case ArmImmCheckType::ImmCheckLaneQIndex:    // force to use quad
-        hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0, 
-                                                    (128/Type.getEltSizeInBits()) - 1);
-        break;
-      case ArmImmCheckType::ImmCheckShiftLeft:
-        hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0, 
-                                                    Type.getEltSizeInBits() - 1);
-        break;
-      case ArmImmCheckType::ImmCheckShiftRight:
-        hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 
-                                                    1, Type.getEltSizeInBits());
-        break;
-      default:
-        llvm_unreachable("Invalid Neon immediate range typeflag!");
-        break;
+    NeonTypeFlags Type = (OverloadType != -1) ? NeonTypeFlags(OverloadType)
+                                              : NeonTypeFlags(ElementType);
+
+    switch ((ArmImmCheckType)CheckTy) {
+    case ArmImmCheckType::ImmCheck0_3:
+      hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0, 3);
+      break;
+    case ArmImmCheckType::ImmCheck0_63:
+      hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0, 63);
+      break;
+    case ArmImmCheckType::ImmCheck0_7:
+      hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0, 7);
+      break;
+    case ArmImmCheckType::ImmCheck1_16:
+      hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 1, 16);
+      break;
+    case ArmImmCheckType::ImmCheck1_32:
+      hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 1, 32);
+      break;
+    case ArmImmCheckType::ImmCheck1_64:
+      hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 1, 64);
+      break;
+    case ArmImmCheckType::ImmCheckLaneIndex:
+      hasError |= SemaRef.BuiltinConstantArgRange(
+          TheCall, ArgIdx, 0,
+          (64 << Type.isQuad()) / Type.getEltSizeInBits() - 1);
+      break;
+    case ArmImmCheckType::ImmCheckLaneQIndex: // always checks the quad form
+      hasError |= SemaRef.BuiltinConstantArgRange(
+          TheCall, ArgIdx, 0, (128 / Type.getEltSizeInBits()) - 1);
+      break;
+    case ArmImmCheckType::ImmCheckShiftLeft:
+      hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0,
+                                                  Type.getEltSizeInBits() - 1);
+      break;
+    case ArmImmCheckType::ImmCheckShiftRight:
+      hasError |= SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 1,
+                                                  Type.getEltSizeInBits());
+      break;
+    default:
+      llvm_unreachable("Invalid Neon immediate range typeflag!");
+      break;
     }
   }
 
   return hasError;
 }
 
-
 bool SemaARM::ParseSVEImmChecks(
     CallExpr *TheCall, SmallVector<std::tuple<int, int, int>, 3> &ImmChecks) {
   // Perform all the immediate checks for this builtin call.
@@ -833,8 +834,8 @@ bool SemaARM::CheckNeonBuiltinFunctionCall(const TargetInfo &TI,
 #include "clang/Basic/arm_fp16.inc"
 #include "clang/Basic/arm_neon.inc"
 #undef GET_NEON_IMMEDIATE_CHECK
-    }
-    
+  }
+
   return ParseNeonImmChecks(TheCall, ImmChecks, TV);
 }
 
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 7666b53000edc..8dc6312525cf5 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -370,10 +370,13 @@ class Intrinsic {
 
 public:
   Intrinsic(Record *R, StringRef Name, StringRef Proto, TypeSpec OutTS,
-            TypeSpec InTS, ArrayRef<std::tuple<int, int, int>> ImmChecks, ClassKind CK, ListInit *Body, NeonEmitter &Emitter,
-            StringRef ArchGuard, StringRef TargetGuard, bool IsUnavailable, bool BigEndianSafe)
-      : R(R), Name(Name.str()), OutTS(OutTS), InTS(InTS), ImmChecks(ImmChecks), CK(CK), Body(Body),
-        ArchGuard(ArchGuard.str()), TargetGuard(TargetGuard.str()), IsUnavailable(IsUnavailable),
+            TypeSpec InTS, ArrayRef<std::tuple<int, int, int>> ImmChecks,
+            ClassKind CK, ListInit *Body, NeonEmitter &Emitter,
+            StringRef ArchGuard, StringRef TargetGuard, bool IsUnavailable,
+            bool BigEndianSafe)
+      : R(R), Name(Name.str()), OutTS(OutTS), InTS(InTS), ImmChecks(ImmChecks),
+        CK(CK), Body(Body), ArchGuard(ArchGuard.str()),
+        TargetGuard(TargetGuard.str()), IsUnavailable(IsUnavailable),
         BigEndianSafe(BigEndianSafe), PolymorphicKeyType(0), NeededEarly(false),
         UseMacro(false), BaseType(OutTS, "."), InBaseType(InTS, "."),
         Emitter(Emitter) {
@@ -416,14 +419,14 @@ class Intrinsic {
   /// Get the architectural guard string (#ifdef).
   std::string getArchGuard() const { return ArchGuard; }
   std::string getTargetGuard() const { return TargetGuard; }
-  ArrayRef<std::tuple<int, int, int>> getImmChecks() const {return ImmChecks; }
+  ArrayRef<std::tuple<int, int, int>> getImmChecks() const { return ImmChecks; }
   /// Get the non-mangled name.
   std::string getName() const { return Name; }
 
   /// Return true if the intrinsic takes an immediate operand.
   bool hasImmediate() const {
     return llvm::any_of(Types, [](const Type &T) { return T.isImmediate(); });
-    //return !ImmChecks.empty();
+    // return !ImmChecks.empty();
   }
 
   // Return if the supplied argument is an immediate
@@ -558,6 +561,7 @@ class NeonEmitter {
   void genNeonImmCheckTypes(raw_ostream &OS);
   void genIntrinsicRangeCheckCode(raw_ostream &OS,
                                   SmallVectorImpl<Intrinsic *> &Defs);
+
 public:
   /// Called by Intrinsic - this attempts to get an intrinsic that takes
   /// the given types as arguments.
@@ -1032,7 +1036,7 @@ std::string Intrinsic::getBuiltinTypeStr() {
     if (LocalCK == ClassI && T.isInteger())
       T.makeSigned();
 
-    if(isArgImmediate(I))
+    if (isArgImmediate(I))
       T.makeImmediate(32);
 
     S += T.builtin_str();
@@ -1953,12 +1957,13 @@ void NeonEmitter::createIntrinsic(Record *R,
   bool BigEndianSafe  = R->getValueAsBit("BigEndianSafe");
   std::string ArchGuard = std::string(R->getValueAsString("ArchGuard"));
   std::string TargetGuard = std::string(R->getValueAsString("TargetGuard"));
-  std::vector<Record*> ImmCheckList = R->getValueAsListOfDefs("ImmChecks");
+  std::vector<Record *> ImmCheckList = R->getValueAsListOfDefs("ImmChecks");
 
   SmallVector<std::tuple<int, int, int>, 2> ImmChecks;
-  for(const auto *R: ImmCheckList) {
+  for (const auto *R : ImmCheckList) {
 
-    ImmChecks.push_back(std::make_tuple(R->getValueAsInt("Arg"), 
+    ImmChecks.push_back(
+        std::make_tuple(R->getValueAsInt("Arg"),
                         R->getValueAsDef("Kind")->getValueAsInt("Value"),
                         R->getValueAsInt("EltSizeArg")));
   }
@@ -2003,8 +2008,9 @@ void NeonEmitter::createIntrinsic(Record *R,
   auto &Entry = IntrinsicMap[Name];
 
   for (auto &I : NewTypeSpecs) {
-    Entry.emplace_back(R, Name, Proto, I.first, I.second, ImmChecks, CK, Body, *this,
-                       ArchGuard, TargetGuard, IsUnavailable, BigEndianSafe);
+    Entry.emplace_back(R, Name, Proto, I.first, I.second, ImmChecks, CK, Body,
+                       *this, ArchGuard, TargetGuard, IsUnavailable,
+                       BigEndianSafe);
     Out.push_back(&Entry.back());
   }
 
@@ -2157,34 +2163,38 @@ void NeonEmitter::genNeonImmCheckTypes(raw_ostream &OS) {
   OS << "#ifdef GET_NEON_IMMCHECKTYPES\n";
 
   for (auto *RV : Records.getAllDerivedDefinitions("ImmCheckType")) {
-    OS << "  " << RV->getNameInitAsString() << " = " << RV->getValueAsInt("Value") << ",\n";
+    OS << "  " << RV->getNameInitAsString() << " = "
+       << RV->getValueAsInt("Value") << ",\n";
   }
 
   OS << "#endif\n\n";
 }
 
-void NeonEmitter::genIntrinsicRangeCheckCode(raw_ostream &OS, SmallVectorImpl<Intrinsic *> &Defs) {
+void NeonEmitter::genIntrinsicRangeCheckCode(
+    raw_ostream &OS, SmallVectorImpl<Intrinsic *> &Defs) {
   OS << "#ifdef GET_NEON_IMMEDIATE_CHECK\n";
   int EltType;
   // Ensure these are only emitted once.
   std::set<std::string> Emitted;
 
   for (auto &Def : Defs) {
-    if (Emitted.find(Def->getMangledName()) != Emitted.end() || !Def->hasImmediate())
+    if (Emitted.find(Def->getMangledName()) != Emitted.end() ||
+        !Def->hasImmediate())
       continue;
 
     // If the Def has a body (operation DAGs), it is not a __builtin_neon_
-    if(Def->hasBody()) continue;
+    if (Def->hasBody())
+      continue;
 
     OS << "case NEON::BI__builtin_neon_" << Def->getMangledName() << ":\n";
-    
-    for(const auto &Check: Def->getImmChecks()){
-      EltType = std::get<2>(Check);   // elt type argument
-      if(EltType >= 0)
+
+    for (const auto &Check : Def->getImmChecks()) {
+      EltType = std::get<2>(Check); // elt type argument
+      if (EltType >= 0)
         EltType = Def->getParamType(EltType).getNeonEnum();
 
-      OS << "  ImmChecks.push_back(std::make_tuple(" << std::get<0>(Check) << 
-                ", " << std::get<1>(Check) <<  ", " << EltType << ")); \n";
+      OS << "  ImmChecks.push_back(std::make_tuple(" << std::get<0>(Check)
+         << ", " << std::get<1>(Check) << ", " << EltType << ")); \n";
       OS << "  break;\n";
     }
     Emitted.insert(Def->getMangledName());
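The arithmetic behind the two lane cases in ParseNeonImmChecks is worth spelling out: ImmCheckLaneIndex accepts 0 through (64 << isQuad) / eltsize - 1, i.e. the last lane of the actual vector, while ImmCheckLaneQIndex always measures against the 128-bit form, accepting 0 through 128 / eltsize - 1. For the vcmla laneq intrinsics the indexed unit is a complex pair rather than a scalar element, which is why the tests earlier in the series use lane 3 as the maximum for f16 (four pairs in 128 bits) and lane 1 for f32 (two pairs). A compile-only C sketch of the resulting user-visible behaviour follows (not from the patch; it assumes a target with the complex-number extension enabled, e.g. -march=armv8.3-a+fp16):

    #include <arm_neon.h>

    float16x4_t lane_in_range(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
      return vcmla_rot90_laneq_f16(acc, lhs, rhs, 3); /* pairs 0..3: accepted */
    }

    float32x2_t lane_in_range_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
      return vcmla_laneq_f32(acc, lhs, rhs, 1);       /* pairs 0..1: accepted */
    }

    /* float16x4_t lane_out_of_range(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
         return vcmla_rot90_laneq_f16(acc, lhs, rhs, 4); // rejected: lane outside [0, 3]
       } */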


