[llvm] [AArch64][GlobalISel] More type support for G_VECREDUCE_ADD (PR #67433)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 26 07:24:30 PDT 2023
https://github.com/chuongg3 created https://github.com/llvm/llvm-project/pull/67433
G_VECREDUCE_ADD now supports v4i16 and v8i8 vector types as source registers.
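For reference, a minimal IR sketch (function names here are made up for illustration) of the reductions this patch lets GlobalISel select directly; per the new tests below, each now lowers to a single ADDV of the corresponding width (addv h0, v0.4h and addv b0, v0.8b):

    define i16 @red_v4i16(<4 x i16> %v) {
      %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v)
      ret i16 %r
    }

    define i8 @red_v8i8(<8 x i8> %v) {
      %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v)
      ret i8 %r
    }

    declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
    declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)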
From 6c1f7c208e8c9db35f234a2dd855697b64acb089 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Tue, 26 Sep 2023 10:18:59 +0100
Subject: [PATCH] [AArch64][GlobalISel] More type support for G_VECREDUCE_ADD
G_VECREDUCE_ADD now supports v4i16 and v8i8 vector types as source registers.
---
.../GISel/AArch64InstructionSelector.cpp | 4 ++
.../AArch64/GISel/AArch64LegalizerInfo.cpp | 9 +++-
llvm/test/CodeGen/AArch64/aarch64-addv.ll | 42 +++++++++++++------
llvm/test/CodeGen/AArch64/vecreduce-add.ll | 36 ++++++++++++++++
4 files changed, 77 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 60bb820585ac0a2..0bbdebb80590a10 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -3559,8 +3559,12 @@ bool AArch64InstructionSelector::selectReduction(MachineInstr &I,
unsigned Opc = 0;
if (VecTy == LLT::fixed_vector(16, 8))
Opc = AArch64::ADDVv16i8v;
+ else if (VecTy == LLT::fixed_vector(8, 8))
+ Opc = AArch64::ADDVv8i8v;
else if (VecTy == LLT::fixed_vector(8, 16))
Opc = AArch64::ADDVv8i16v;
+ else if (VecTy == LLT::fixed_vector(4, 16))
+ Opc = AArch64::ADDVv4i16v;
else if (VecTy == LLT::fixed_vector(4, 32))
Opc = AArch64::ADDVv4i32v;
else if (VecTy == LLT::fixed_vector(2, 64))
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 8d3d94290b0e580..323b81f2175f3fb 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -861,8 +861,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.lower();
getActionDefinitionsBuilder(G_VECREDUCE_ADD)
- .legalFor(
- {{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
+ .legalFor({{s8, v16s8},
+ {s8, v8s8},
+ {s16, v8s16},
+ {s16, v4s16},
+ {s32, v4s32},
+ {s32, v2s32},
+ {s64, v2s64}})
.clampMaxNumElements(1, s64, 2)
.clampMaxNumElements(1, s32, 4)
.lower();
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
index 6cab309d7c094c2..f1798ccb1e3bbaa 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -144,12 +144,21 @@ define i32 @oversized_ADDV_512(ptr %arr) {
}
define i8 @addv_combine_i8(<8 x i8> %a1, <8 x i8> %a2) {
-; CHECK-LABEL: addv_combine_i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: addv b0, v0.8b
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: addv_combine_i8:
+; SDAG: // %bb.0: // %entry
+; SDAG-NEXT: add v0.8b, v0.8b, v1.8b
+; SDAG-NEXT: addv b0, v0.8b
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: addv_combine_i8:
+; GISEL: // %bb.0: // %entry
+; GISEL-NEXT: addv b0, v0.8b
+; GISEL-NEXT: addv b1, v1.8b
+; GISEL-NEXT: fmov w8, s0
+; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: add w0, w9, w8, uxtb
+; GISEL-NEXT: ret
entry:
%rdx.1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a1)
%rdx.2 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a2)
@@ -158,12 +167,21 @@ entry:
}
define i16 @addv_combine_i16(<4 x i16> %a1, <4 x i16> %a2) {
-; CHECK-LABEL: addv_combine_i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: addv h0, v0.4h
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: addv_combine_i16:
+; SDAG: // %bb.0: // %entry
+; SDAG-NEXT: add v0.4h, v0.4h, v1.4h
+; SDAG-NEXT: addv h0, v0.4h
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: addv_combine_i16:
+; GISEL: // %bb.0: // %entry
+; GISEL-NEXT: addv h0, v0.4h
+; GISEL-NEXT: addv h1, v1.4h
+; GISEL-NEXT: fmov w8, s0
+; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: add w0, w9, w8, uxth
+; GISEL-NEXT: ret
entry:
%rdx.1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a1)
%rdx.2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 039417784da0bba..4d2ec0ba7107ed9 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -2,6 +2,28 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-BASE
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-DOT
+define i32 @addv_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: addv_v2i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %arg1 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
+ ret i32 %arg1
+}
+
+define i16 @addv_v4i16(<4 x i16> %a) {
+; CHECK-LABEL: addv_v4i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: addv h0, v0.4h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %arg1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
+ ret i16 %arg1
+}
+
define i32 @add_v4i32_v4i32(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK: // %bb.0: // %entry
@@ -13,6 +35,17 @@ entry:
ret i32 %z
}
+define i8 @addv_v8i8(<8 x i8> %a) {
+; CHECK-LABEL: addv_v8i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %arg1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
+ ret i8 %arg1
+}
+
define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK: // %bb.0: // %entry
@@ -2261,7 +2294,9 @@ entry:
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1 immarg) #1
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
@@ -2269,3 +2304,4 @@ declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)