[llvm] e69916c - [AArch64][GlobalISel] Legalize integer across-lane intrinsics with actual type

Vladislav Dzhidzhoev via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 17 09:21:18 PDT 2023


Author: Vladislav Dzhidzhoev
Date: 2023-08-17T18:19:56+02:00
New Revision: e69916c9430dcb29fe979111a6103957bfc70d64

URL: https://github.com/llvm/llvm-project/commit/e69916c9430dcb29fe979111a6103957bfc70d64
DIFF: https://github.com/llvm/llvm-project/commit/e69916c9430dcb29fe979111a6103957bfc70d64.diff

LOG: [AArch64][GlobalISel] Legalize integer across-lane intrinsics with actual type

Across-lane intrinsics with an integer destination type (uaddv, saddv,
umaxv, smaxv, uminv, sminv) were legalized with the destination type
given in the LLVM IR intrinsic's definition, which is wider than the
actual destination type of the corresponding machine instruction.
InstructionSelect was implicitly expected to generate the underlying
extension instructions for these intrinsics, while the real destination
type remained opaque to the other GlobalISel passes. As a result,
llvm/test/CodeGen/AArch64/arm64-vaddv.ll failed on GlobalISel because
the generated code was worse in functions that use the value of an
across-lane intrinsic in subsequent FP/SIMD instructions (functions
with the _used_by_laneop suffix).
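
For reference, a minimal sketch of the affected pattern, modeled on the
_used_by_laneop tests below (names are illustrative): the IR-level
intrinsic is declared to return i32, while the underlying sminv.8b
instruction only produces an i8 lane value.

  define <8 x i8> @sminv_used_by_laneop(<8 x i8> %acc, <8 x i8> %v) {
  entry:
    ; The intrinsic result type is i32, wider than the b-register value
    ; actually produced by sminv.8b.
    %red = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %v)
    %trunc = trunc i32 %red to i8
    ; Using the reduced value in a lane insert keeps it on the FPR side.
    %lane = insertelement <8 x i8> %acc, i8 %trunc, i32 3
    ret <8 x i8> %lane
  }

  declare i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8>)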

With this change, the intrinsics are legalized and selected with their
actual destination type, making it transparent to the other passes. If
the destination value is used by later instructions that accept FPR
registers, no extra copies across register banks are generated. To make
this possible, the i16 type is added to the list of types of the FPR16
register class, and a few SelectionDAG patterns are adjusted to
eliminate type ambiguity in TableGen.
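
Schematically (a hand-written sketch, not actual MIR dumps from a tool
run), the legalizer now rewrites

  %r:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.sminv), %v(<8 x s8>)

into

  %n:_(s8) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.sminv), %v(<8 x s8>)
  %r:_(s32) = G_SEXT %n(s8)

(G_ZEXT for the unsigned variants), so the narrow result remains
visible to RegBankSelect and InstructionSelect instead of being
materialized implicitly during selection.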

Differential Revision: https://reviews.llvm.org/D156831

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64InstrFormats.td
    llvm/lib/Target/AArch64/AArch64InstrGISel.td
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/lib/Target/AArch64/AArch64RegisterInfo.td
    llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
    llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
    llvm/test/CodeGen/AArch64/arm64-smaxv.ll
    llvm/test/CodeGen/AArch64/arm64-sminv.ll
    llvm/test/CodeGen/AArch64/arm64-umaxv.ll
    llvm/test/CodeGen/AArch64/arm64-uminv.ll
    llvm/test/CodeGen/AArch64/arm64-vaddv.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 9eccfedd9b6529..1ff52f52009678 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -8601,7 +8601,7 @@ multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
                    (f16 (vector_extract (v8f16 V128:$Rn), (i64 0))),
                    (f16 (vector_extract (v8f16 V128:$Rm), VectorIndexH:$idx)))),
             (!cast<Instruction>(NAME # v1i16_indexed)
-              (EXTRACT_SUBREG V128:$Rn, hsub), V128:$Rm, VectorIndexH:$idx)>;
+              (f16 (EXTRACT_SUBREG V128:$Rn, hsub)), V128:$Rm, VectorIndexH:$idx)>;
   }
 
   let Predicates = [HasNEON] in {
@@ -9157,7 +9157,7 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
                                     (i64 0))))),
             (!cast<Instruction>(NAME # v1i32_indexed)
                         FPR32Op:$Rd,
-                        (EXTRACT_SUBREG V64:$Rn, hsub),
+                        (f16 (EXTRACT_SUBREG V64:$Rn, hsub)),
                         (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),
                         (i64 0))>;
 
@@ -9170,7 +9170,7 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
                                     (i64 0))))),
             (!cast<Instruction>(NAME # v1i32_indexed)
                         FPR32Op:$Rd,
-                        (EXTRACT_SUBREG V64:$Rn, hsub),
+                        (f16 (EXTRACT_SUBREG V64:$Rn, hsub)),
                         V128_lo:$Rm,
                         VectorIndexH:$idx)>;
 

diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index b3d093af1c1657..f9f860607b5877 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -303,30 +303,43 @@ def : Pat<(int_aarch64_stlxp GPR64:$lo, GPR64:$hi, GPR64:$addr),
 def : Pat<(int_aarch64_stxp GPR64:$lo, GPR64:$hi, GPR64:$addr),
           (STXPX GPR64:$lo, GPR64:$hi, GPR64:$addr)>;
 
+let GIIgnoreCopies = 1 in
+class PatIgnoreCopies<dag pattern, dag result> : Pat<pattern, result>, GISelFlags;
+
 multiclass SIMDAcrossLanesSignedIntrinsicBHS<string baseOpc, Intrinsic intOp> {
-  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+  def : PatIgnoreCopies<(i32 (sext (i8 (intOp (v8i8 V64:$Rn))))),
         (i32 (SMOVvi8to32
           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+           (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
           (i64 0)))>;
-  def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+  def : Pat<(i8 (intOp (v8i8 V64:$Rn))),
+        (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn)>;
+
+  def : PatIgnoreCopies<(i32 (sext (i8 (intOp (v16i8 V128:$Rn))))),
         (i32 (SMOVvi8to32
           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
           (i64 0)))>;
+  def : Pat<(i8 (intOp (v16i8 V128:$Rn))),
+        (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn)>;
 
-  def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+  def : PatIgnoreCopies<(i32 (sext (i16 (intOp (v4i16 V64:$Rn))))),
         (i32 (SMOVvi16to32
           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
           (i64 0)))>;
-  def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+  def : Pat<(i16 (intOp (v4i16 V64:$Rn))),
+        (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn)>;
+
+  def : PatIgnoreCopies<(i32 (sext (i16 (intOp (v8i16 V128:$Rn))))),
         (i32 (SMOVvi16to32
           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
           (i64 0)))>;
+  def : Pat<(i16 (intOp (v8i16 V128:$Rn))),
+        (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn)>;
 
-  def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
+  def : PatIgnoreCopies<(i32 (intOp (v4i32 V128:$Rn))),
         (i32 (EXTRACT_SUBREG
           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
@@ -335,29 +348,48 @@ multiclass SIMDAcrossLanesSignedIntrinsicBHS<string baseOpc, Intrinsic intOp> {
 
 multiclass SIMDAcrossLanesUnsignedIntrinsicBHS<string baseOpc,
                                                 Intrinsic intOp> {
-  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
-          ssub))>;
-  def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
-          ssub))>;
+  def : PatIgnoreCopies<(i32 (zext (i8 (intOp (v8i8 V64:$Rn))))),
+        (COPY_TO_REGCLASS
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+            ssub)),
+          GPR32)>;
+  def : Pat<(i8 (intOp (v8i8 V64:$Rn))),
+        (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn)>;
+
+  def : PatIgnoreCopies<(i32 (zext (i8 (intOp (v16i8 V128:$Rn))))),
+        (COPY_TO_REGCLASS
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+            ssub)),
+        GPR32)>;
+  def : Pat<(i8 (intOp (v16i8 V128:$Rn))),
+        (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn)>;
+
 
-  def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+  def : PatIgnoreCopies<(i32 (zext (i16 (intOp (v4i16 V64:$Rn))))),
+        (COPY_TO_REGCLASS
           (i32 (EXTRACT_SUBREG
             (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
               (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
-            ssub))>;
-  def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
-          ssub))>;
+            ssub)),
+          GPR32)>;
+  def : Pat<(i16 (intOp (v4i16 V64:$Rn))),
+        (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn)>;
 
-  def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
+  def : PatIgnoreCopies<(i32 (zext (i16 (intOp (v8i16 V128:$Rn))))),
+        (COPY_TO_REGCLASS
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+            ssub)),
+        GPR32)>;
+  def : Pat<(i16 (intOp (v8i16 V128:$Rn))),
+        (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn)>;
+
+  def : PatIgnoreCopies<(i32 (intOp (v4i32 V128:$Rn))),
         (i32 (EXTRACT_SUBREG
           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
             (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
@@ -373,12 +405,23 @@ def : Pat<(i32 (int_aarch64_neon_saddv (v2i32 V64:$Rn))),
               (ADDPv2i32 V64:$Rn, V64:$Rn), dsub),
             ssub))>;
 
+def : Pat<(i64 (int_aarch64_neon_saddv (v2i64 V128:$Rn))),
+          (i64 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+              (ADDPv2i64p V128:$Rn), dsub),
+            dsub))>;
+
 defm : SIMDAcrossLanesUnsignedIntrinsicBHS<"ADDV", int_aarch64_neon_uaddv>;
 def : Pat<(i32 (int_aarch64_neon_uaddv (v2i32 V64:$Rn))),
           (i32 (EXTRACT_SUBREG
             (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
               (ADDPv2i32 V64:$Rn, V64:$Rn), dsub),
             ssub))>;
+def : Pat<(i64 (int_aarch64_neon_uaddv (v2i64 V128:$Rn))),
+          (i64 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+              (ADDPv2i64p V128:$Rn), dsub),
+            dsub))>;
 
 defm : SIMDAcrossLanesSignedIntrinsicBHS<"SMAXV", int_aarch64_neon_smaxv>;
 def : Pat<(i32 (int_aarch64_neon_smaxv (v2i32 V64:$Rn))),

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 79f460765bce51..133e93e58b70b2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3679,27 +3679,28 @@ let Predicates = [IsLE, UseSTRQro] in {
 // Match stores from lane 0 to the appropriate subreg's store.
 multiclass VecROStoreLane0Pat<ROAddrMode ro, SDPatternOperator storeop,
                               ValueType VecTy, ValueType STy,
+                              ValueType SubRegTy,
                               SubRegIndex SubRegIdx,
                               Instruction STRW, Instruction STRX> {
 
   def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
                      (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
-            (STRW (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+            (STRW (SubRegTy (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx)),
                   GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
 
   def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
                      (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
-            (STRX (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+            (STRX (SubRegTy (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx)),
                   GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
 }
 
 let AddedComplexity = 19 in {
-  defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, hsub, STRHroW, STRHroX>;
-  defm : VecROStoreLane0Pat<ro16,         store, v8f16, f16, hsub, STRHroW, STRHroX>;
-  defm : VecROStoreLane0Pat<ro32,         store, v4i32, i32, ssub, STRSroW, STRSroX>;
-  defm : VecROStoreLane0Pat<ro32,         store, v4f32, f32, ssub, STRSroW, STRSroX>;
-  defm : VecROStoreLane0Pat<ro64,         store, v2i64, i64, dsub, STRDroW, STRDroX>;
-  defm : VecROStoreLane0Pat<ro64,         store, v2f64, f64, dsub, STRDroW, STRDroX>;
+  defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, f16, hsub, STRHroW, STRHroX>;
+  defm : VecROStoreLane0Pat<ro16,         store, v8f16, f16, f16, hsub, STRHroW, STRHroX>;
+  defm : VecROStoreLane0Pat<ro32,         store, v4i32, i32, i32, ssub, STRSroW, STRSroX>;
+  defm : VecROStoreLane0Pat<ro32,         store, v4f32, f32, i32, ssub, STRSroW, STRSroX>;
+  defm : VecROStoreLane0Pat<ro64,         store, v2i64, i64, i64, dsub, STRDroW, STRDroX>;
+  defm : VecROStoreLane0Pat<ro64,         store, v2f64, f64, i64, dsub, STRDroW, STRDroX>;
 }
 
 //---
@@ -3818,21 +3819,22 @@ def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)),
 // Match stores from lane 0 to the appropriate subreg's store.
 multiclass VecStoreLane0Pat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
                             ValueType VTy, ValueType STy,
+                            ValueType SubRegTy,
                             SubRegIndex SubRegIdx, Operand IndexType,
                             Instruction STR> {
   def : Pat<(storeop (STy (vector_extract (VTy VecListOne128:$Vt), 0)),
                      (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
-            (STR (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+            (STR (SubRegTy (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx)),
                  GPR64sp:$Rn, IndexType:$offset)>;
 }
 
 let AddedComplexity = 19 in {
-  defm : VecStoreLane0Pat<am_indexed16, truncstorei16, v8i16, i32, hsub, uimm12s2, STRHui>;
-  defm : VecStoreLane0Pat<am_indexed16,         store, v8f16, f16, hsub, uimm12s2, STRHui>;
-  defm : VecStoreLane0Pat<am_indexed32,         store, v4i32, i32, ssub, uimm12s4, STRSui>;
-  defm : VecStoreLane0Pat<am_indexed32,         store, v4f32, f32, ssub, uimm12s4, STRSui>;
-  defm : VecStoreLane0Pat<am_indexed64,         store, v2i64, i64, dsub, uimm12s8, STRDui>;
-  defm : VecStoreLane0Pat<am_indexed64,         store, v2f64, f64, dsub, uimm12s8, STRDui>;
+  defm : VecStoreLane0Pat<am_indexed16, truncstorei16, v8i16, i32, f16, hsub, uimm12s2, STRHui>;
+  defm : VecStoreLane0Pat<am_indexed16,         store, v8f16, f16, f16, hsub, uimm12s2, STRHui>;
+  defm : VecStoreLane0Pat<am_indexed32,         store, v4i32, i32, i32, ssub, uimm12s4, STRSui>;
+  defm : VecStoreLane0Pat<am_indexed32,         store, v4f32, f32, i32, ssub, uimm12s4, STRSui>;
+  defm : VecStoreLane0Pat<am_indexed64,         store, v2i64, i64, i64, dsub, uimm12s8, STRDui>;
+  defm : VecStoreLane0Pat<am_indexed64,         store, v2f64, f64, i64, dsub, uimm12s8, STRDui>;
 }
 
 //---
@@ -3961,17 +3963,18 @@ def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
 // Match stores from lane 0 to the appropriate subreg's store.
 multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
                              ValueType VTy, ValueType STy,
+                             ValueType SubRegTy,
                              SubRegIndex SubRegIdx, Instruction STR> {
-  defm : VecStoreLane0Pat<am_unscaled128, StoreOp, VTy, STy, SubRegIdx, simm9, STR>;
+  defm : VecStoreLane0Pat<am_unscaled128, StoreOp, VTy, STy, SubRegTy, SubRegIdx, simm9, STR>;
 }
 
 let AddedComplexity = 19 in {
-  defm : VecStoreULane0Pat<truncstorei16, v8i16, i32, hsub, STURHi>;
-  defm : VecStoreULane0Pat<store,         v8f16, f16, hsub, STURHi>;
-  defm : VecStoreULane0Pat<store,         v4i32, i32, ssub, STURSi>;
-  defm : VecStoreULane0Pat<store,         v4f32, f32, ssub, STURSi>;
-  defm : VecStoreULane0Pat<store,         v2i64, i64, dsub, STURDi>;
-  defm : VecStoreULane0Pat<store,         v2f64, f64, dsub, STURDi>;
+  defm : VecStoreULane0Pat<truncstorei16, v8i16, i32, f16, hsub, STURHi>;
+  defm : VecStoreULane0Pat<store,         v8f16, f16, f16, hsub, STURHi>;
+  defm : VecStoreULane0Pat<store,         v4i32, i32, i32, ssub, STURSi>;
+  defm : VecStoreULane0Pat<store,         v4f32, f32, i32, ssub, STURSi>;
+  defm : VecStoreULane0Pat<store,         v2i64, i64, i64, dsub, STURDi>;
+  defm : VecStoreULane0Pat<store,         v2f64, f64, i64, dsub, STURDi>;
 }
 
 //---
@@ -4496,7 +4499,7 @@ multiclass FMULScalarFromIndexedLane0Patterns<string inst,
   def : Pat<(f16 (OpNode (f16 FPR16:$Rn),
                          (f16 (vector_extract (v8f16 V128:$Rm), (i64 0))))),
             (!cast<Instruction>(inst # inst_f16_suffix)
-              FPR16:$Rn, (EXTRACT_SUBREG V128:$Rm, hsub))>;
+              FPR16:$Rn, (f16 (EXTRACT_SUBREG V128:$Rm, hsub)))>;
   }
   let Predicates = preds in {
   def : Pat<(f32 (OpNode (f32 FPR32:$Rn),
@@ -7064,19 +7067,19 @@ def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
 // Patterns for FP16 Intrinsics - requires reg copy to/from as i16s not supported.
 
 def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 (sext_inreg FPR32:$Rn, i16)), vecshiftR16:$imm)),
-          (SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+          (SCVTFh (f16 (EXTRACT_SUBREG FPR32:$Rn, hsub)), vecshiftR16:$imm)>;
 def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 FPR32:$Rn), vecshiftR16:$imm)),
-          (SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+          (SCVTFh (f16 (EXTRACT_SUBREG FPR32:$Rn, hsub)), vecshiftR16:$imm)>;
 def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR16:$imm)),
-          (SCVTFh (EXTRACT_SUBREG FPR64:$Rn, hsub), vecshiftR16:$imm)>;
+          (SCVTFh (f16 (EXTRACT_SUBREG FPR64:$Rn, hsub)), vecshiftR16:$imm)>;
 def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp
             (and FPR32:$Rn, (i32 65535)),
             vecshiftR16:$imm)),
-          (UCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+          (UCVTFh (f16 (EXTRACT_SUBREG FPR32:$Rn, hsub)), vecshiftR16:$imm)>;
 def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR16:$imm)),
-          (UCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
+          (UCVTFh (f16 (EXTRACT_SUBREG FPR32:$Rn, hsub)), vecshiftR16:$imm)>;
 def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR16:$imm)),
-          (UCVTFh (EXTRACT_SUBREG FPR64:$Rn, hsub), vecshiftR16:$imm)>;
+          (UCVTFh (f16 (EXTRACT_SUBREG FPR64:$Rn, hsub)), vecshiftR16:$imm)>;
 def : Pat<(i32 (int_aarch64_neon_vcvtfp2fxs (f16 FPR16:$Rn), vecshiftR32:$imm)),
           (i32 (INSERT_SUBREG
             (i32 (IMPLICIT_DEF)),

diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index c7170b17c3a9cb..9aba263da4f47b 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -438,7 +438,7 @@ def Q31   : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>;
 def FPR8  : RegisterClass<"AArch64", [i8], 8, (sequence "B%u", 0, 31)> {
   let Size = 8;
 }
-def FPR16 : RegisterClass<"AArch64", [f16, bf16], 16, (sequence "H%u", 0, 31)> {
+def FPR16 : RegisterClass<"AArch64", [f16, bf16, i16], 16, (sequence "H%u", 0, 31)> {
   let Size = 16;
 }
 

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index b17f12d82b7939..61f1350c5eeb43 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1134,7 +1134,8 @@ bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
 
 bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                              MachineInstr &MI) const {
-  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
+  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
+  switch (IntrinsicID) {
   case Intrinsic::vacopy: {
     unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
     unsigned VaListSize =
@@ -1214,6 +1215,36 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     MI.eraseFromParent();
     return true;
   }
+  case Intrinsic::aarch64_neon_uaddv:
+  case Intrinsic::aarch64_neon_saddv:
+  case Intrinsic::aarch64_neon_umaxv:
+  case Intrinsic::aarch64_neon_smaxv:
+  case Intrinsic::aarch64_neon_uminv:
+  case Intrinsic::aarch64_neon_sminv: {
+    MachineIRBuilder MIB(MI);
+    MachineRegisterInfo &MRI = *MIB.getMRI();
+    bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv ||
+                    IntrinsicID == Intrinsic::aarch64_neon_smaxv ||
+                    IntrinsicID == Intrinsic::aarch64_neon_sminv;
+
+    auto OldDst = MI.getOperand(0).getReg();
+    auto OldDstTy = MRI.getType(OldDst);
+    LLT NewDstTy = MRI.getType(MI.getOperand(2).getReg()).getElementType();
+    if (OldDstTy == NewDstTy)
+      return true;
+
+    auto NewDst = MRI.createGenericVirtualRegister(NewDstTy);
+
+    Helper.Observer.changingInstr(MI);
+    MI.getOperand(0).setReg(NewDst);
+    Helper.Observer.changedInstr(MI);
+
+    MIB.setInsertPt(MIB.getMBB(), ++MIB.getInsertPt());
+    MIB.buildExtOrTrunc(IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT,
+                        OldDst, NewDst);
+
+    return true;
+  }
   }
 
   return true;

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 46e9436d4b94b0..8ca2bc641b14a7 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -493,8 +493,12 @@ static bool isFPIntrinsic(const MachineRegisterInfo &MRI,
     return false;
   case Intrinsic::aarch64_neon_uaddlv:
   case Intrinsic::aarch64_neon_uaddv:
+  case Intrinsic::aarch64_neon_saddv:
   case Intrinsic::aarch64_neon_umaxv:
+  case Intrinsic::aarch64_neon_smaxv:
   case Intrinsic::aarch64_neon_uminv:
+  case Intrinsic::aarch64_neon_sminv:
+  case Intrinsic::aarch64_neon_faddv:
   case Intrinsic::aarch64_neon_fmaxv:
   case Intrinsic::aarch64_neon_fminv:
   case Intrinsic::aarch64_neon_fmaxnmv:
@@ -505,13 +509,6 @@ static bool isFPIntrinsic(const MachineRegisterInfo &MRI,
     return SrcTy.getElementType().getSizeInBits() >= 16 &&
            SrcTy.getElementCount().getFixedValue() >= 4;
   }
-  case Intrinsic::aarch64_neon_saddv:
-  case Intrinsic::aarch64_neon_smaxv:
-  case Intrinsic::aarch64_neon_sminv: {
-    const LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
-    return SrcTy.getElementType().getSizeInBits() >= 32 &&
-           SrcTy.getElementCount().getFixedValue() >= 2;
-  }
   }
 }
 

diff --git a/llvm/test/CodeGen/AArch64/arm64-smaxv.ll b/llvm/test/CodeGen/AArch64/arm64-smaxv.ll
index 4ead34f5a69fd9..cae7c99558c755 100644
--- a/llvm/test/CodeGen/AArch64/arm64-smaxv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-smaxv.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -global-isel=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
 
 define signext i8 @test_vmaxv_s8(<8 x i8> %a1) {
 ; CHECK: test_vmaxv_s8

diff --git a/llvm/test/CodeGen/AArch64/arm64-sminv.ll b/llvm/test/CodeGen/AArch64/arm64-sminv.ll
index 3f2296d726a963..683044c9e8e0ba 100644
--- a/llvm/test/CodeGen/AArch64/arm64-sminv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-sminv.ll
@@ -1,14 +1,11 @@
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG
-; RUN: llc < %s -global-isel=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s --check-prefix=CHECK --check-prefix=GISEL
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -global-isel=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
 
 define signext i8 @test_vminv_s8(<8 x i8> %a1) {
 ; CHECK: test_vminv_s8
 ; CHECK: sminv.8b b[[REGNUM:[0-9]+]], v0
-; SDAG-NEXT: smov.b w0, v[[REGNUM]][0]
-; SDAG-NEXT: ret
-; GISEL-NEXT: smov.b w8, v[[REGNUM]][0]
-; GISEL-NEXT: sxtb  w0, w8
-; GISEL-NEXT: ret
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
 entry:
   %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a1)
   %0 = trunc i32 %vminv.i to i8
@@ -18,11 +15,8 @@ entry:
 define signext i16 @test_vminv_s16(<4 x i16> %a1) {
 ; CHECK: test_vminv_s16
 ; CHECK: sminv.4h h[[REGNUM:[0-9]+]], v0
-; SDAG-NEXT: smov.h w0, v[[REGNUM]][0]
-; SDAG-NEXT: ret
-; GISEL-NEXT: smov.h w8, v[[REGNUM]][0]
-; GISEL-NEXT: sxth  w0, w8
-; GISEL-NEXT: ret
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
 entry:
   %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> %a1)
   %0 = trunc i32 %vminv.i to i16
@@ -43,11 +37,8 @@ entry:
 define signext i8 @test_vminvq_s8(<16 x i8> %a1) {
 ; CHECK: test_vminvq_s8
 ; CHECK: sminv.16b b[[REGNUM:[0-9]+]], v0
-; SDAG-NEXT: smov.b w0, v[[REGNUM]][0]
-; SDAG-NEXT: ret
-; GISEL-NEXT: smov.b w8, v[[REGNUM]][0]
-; GISEL-NEXT: sxtb  w0, w8
-; GISEL-NEXT: ret
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
 entry:
   %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a1)
   %0 = trunc i32 %vminv.i to i8
@@ -57,11 +48,8 @@ entry:
 define signext i16 @test_vminvq_s16(<8 x i16> %a1) {
 ; CHECK: test_vminvq_s16
 ; CHECK: sminv.8h h[[REGNUM:[0-9]+]], v0
-; SDAG-NEXT: smov.h w0, v[[REGNUM]][0]
-; SDAG-NEXT: ret
-; GISEL-NEXT: smov.h w8, v[[REGNUM]][0]
-; GISEL-NEXT: sxth  w0, w8
-; GISEL-NEXT: ret
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
 entry:
   %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> %a1)
   %0 = trunc i32 %vminv.i to i16
@@ -81,11 +69,8 @@ entry:
 define <8 x i8> @test_vminv_s8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) {
 ; CHECK-LABEL: test_vminv_s8_used_by_laneop:
 ; CHECK: sminv.8b b[[REGNUM:[0-9]+]], v1
-; SDAG-NEXT: mov.b v0[3], v[[REGNUM]][0]
-; SDAG-NEXT: ret
-; GISEL-NEXT: smov.b  w8, v[[REGNUM]][0]
-; GISEL-NEXT: mov.b v0[3], w8
-; GISEL-NEXT: ret
+; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
+; CHECK-NEXT: ret
 entry:
   %0 = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a2)
   %1 = trunc i32 %0 to i8
@@ -96,11 +81,8 @@ entry:
 define <4 x i16> @test_vminv_s16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) {
 ; CHECK-LABEL: test_vminv_s16_used_by_laneop:
 ; CHECK: sminv.4h h[[REGNUM:[0-9]+]], v1
-; SDAG-NEXT: mov.h v0[3], v[[REGNUM]][0]
-; SDAG-NEXT: ret
-; GISEL-NEXT: smov.h  w8, v[[REGNUM]][0]
-; GISEL-NEXT: mov.h v0[3], w8
-; GISEL-NEXT: ret
+; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
+; CHECK-NEXT: ret
 entry:
   %0 = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> %a2)
   %1 = trunc i32 %0 to i16
@@ -122,11 +104,8 @@ entry:
 define <16 x i8> @test_vminvq_s8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) {
 ; CHECK-LABEL: test_vminvq_s8_used_by_laneop:
 ; CHECK: sminv.16b b[[REGNUM:[0-9]+]], v1
-; SDAG-NEXT: mov.b v0[3], v[[REGNUM]][0]
-; SDAG-NEXT: ret
-; GISEL-NEXT: smov.b  w8, v[[REGNUM]][0]
-; GISEL-NEXT: mov.b v0[3], w8
-; GISEL-NEXT: ret
+; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
+; CHECK-NEXT: ret
 entry:
   %0 = tail call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a2)
   %1 = trunc i32 %0 to i8
@@ -137,11 +116,8 @@ entry:
 define <8 x i16> @test_vminvq_s16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) {
 ; CHECK-LABEL: test_vminvq_s16_used_by_laneop:
 ; CHECK: sminv.8h h[[REGNUM:[0-9]+]], v1
-; SDAG-NEXT: mov.h v0[3], v[[REGNUM]][0]
-; SDAG-NEXT: ret
-; GISEL-NEXT: smov.h  w8, v[[REGNUM]][0]
-; GISEL-NEXT: mov.h v0[3], w8
-; GISEL-NEXT: ret
+; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
+; CHECK-NEXT: ret
 entry:
   %0 = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> %a2)
   %1 = trunc i32 %0 to i16

diff --git a/llvm/test/CodeGen/AArch64/arm64-umaxv.ll b/llvm/test/CodeGen/AArch64/arm64-umaxv.ll
index 505dd1668104f7..128ede01c83539 100644
--- a/llvm/test/CodeGen/AArch64/arm64-umaxv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-umaxv.ll
@@ -1,13 +1,12 @@
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s --check-prefix CHECK --check-prefix SDAG
-; RUN: llc < %s -global-isel=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s --check-prefix CHECK --check-prefix GISEL
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -global-isel=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
 
 define i32 @vmax_u8x8(<8 x i8> %a) nounwind ssp {
 ; CHECK-LABEL: vmax_u8x8:
 ; CHECK: umaxv.8b        b[[REG:[0-9]+]], v0
 ; CHECK: fmov    [[REG2:w[0-9]+]], s[[REG]]
 ; CHECK-NOT: and
-; SDAG: cbz     [[REG2]],
-; GISEL: b
+; CHECK: cbz     [[REG2]],
 entry:
   %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a) nounwind
   %tmp = trunc i32 %vmaxv.i to i8
@@ -30,8 +29,7 @@ define i32 @vmax_u4x16(<4 x i16> %a) nounwind ssp {
 ; CHECK: umaxv.4h        h[[REG:[0-9]+]], v0
 ; CHECK: fmov    [[REG2:w[0-9]+]], s[[REG]]
 ; CHECK-NOT: and
-; SDAG: cbz     [[REG2]],
-; GISEL: b
+; CHECK: cbz     [[REG2]],
 entry:
   %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> %a) nounwind
   %tmp = trunc i32 %vmaxv.i to i16
@@ -52,8 +50,7 @@ define i32 @vmax_u8x16(<8 x i16> %a) nounwind ssp {
 ; CHECK: umaxv.8h        h[[REG:[0-9]+]], v0
 ; CHECK: fmov    [[REG2:w[0-9]+]], s[[REG]]
 ; CHECK-NOT: and
-; SDAG: cbz     [[REG2]],
-; GISEL: b
+; CHECK: cbz     [[REG2]],
 entry:
   %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> %a) nounwind
   %tmp = trunc i32 %vmaxv.i to i16
@@ -74,8 +71,7 @@ define i32 @vmax_u16x8(<16 x i8> %a) nounwind ssp {
 ; CHECK: umaxv.16b        b[[REG:[0-9]+]], v0
 ; CHECK: fmov     [[REG2:w[0-9]+]], s[[REG]]
 ; CHECK-NOT: and
-; SDAG: cbz     [[REG2]],
-; GISEL: b
+; CHECK: cbz     [[REG2]],
 entry:
   %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a) nounwind
   %tmp = trunc i32 %vmaxv.i to i8

diff --git a/llvm/test/CodeGen/AArch64/arm64-uminv.ll b/llvm/test/CodeGen/AArch64/arm64-uminv.ll
index ec488feb1a7e69..98b3d6de62976f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-uminv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-uminv.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -global-isel=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
 
 define i32 @vmin_u8x8(<8 x i8> %a) nounwind ssp {
 ; CHECK-LABEL: vmin_u8x8:

diff --git a/llvm/test/CodeGen/AArch64/arm64-vaddv.ll b/llvm/test/CodeGen/AArch64/arm64-vaddv.ll
index f5da7994831683..04e19dce9ad75a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vaddv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vaddv.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false -mcpu=cyclone | FileCheck %s
+; RUN: llc < %s -global-isel -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false -mcpu=cyclone | FileCheck %s
 
 define signext i8 @test_vaddv_s8(<8 x i8> %a1) {
 ; CHECK-LABEL: test_vaddv_s8:


        

