[llvm] eabae1b - [AArch64][CodeGen] Always use SVE (when enabled) to lower 64-bit vector multiplies

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 8 07:37:58 PST 2022


Author: David Sherwood
Date: 2022-02-08T15:37:52Z
New Revision: eabae1b0175691d1f979299b22a25ed4474864a0

URL: https://github.com/llvm/llvm-project/commit/eabae1b0175691d1f979299b22a25ed4474864a0
DIFF: https://github.com/llvm/llvm-project/commit/eabae1b0175691d1f979299b22a25ed4474864a0.diff

LOG: [AArch64][CodeGen] Always use SVE (when enabled) to lower 64-bit vector multiplies

This patch adds custom lowering support for ISD::MUL with v1i64 and v2i64
types when SVE is enabled, regardless of the minimum SVE vector length. We
do this because NEON has no 64-bit vector multiply instructions, so we want
to take advantage of the ones that SVE provides.
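As an illustration (a minimal sketch based on the updated tests, not code
taken from this patch), a plain 128-bit vector multiply such as

  define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
    ; A single IR multiply of two 64-bit lanes; NEON has no matching vector
    ; instruction, so without SVE this would be scalarized through GPRs.
    %res = mul <2 x i64> %op1, %op2
    ret <2 x i64> %res
  }

now lowers to a predicated SVE multiply (ptrue p0.d, vl2 followed by
mul z0.d, p0/m, z0.d, z1.d) even with -aarch64-sve-vector-bits-min=128,
as the new VBITS_EQ_128 check lines in the diff below show.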

I've updated the tests that specify a minimum SVE vector length of 128 bits:

  CodeGen/AArch64/sve-fixed-length-int-arith.ll
  CodeGen/AArch64/sve-fixed-length-int-mulh.ll
  CodeGen/AArch64/sve-fixed-length-int-rem.ll

Differential Revision: https://reviews.llvm.org/D118802

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bf97741dbe70b..34d13ec69c0ab 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1341,6 +1341,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::UDIV, VT, Custom);
     }
 
+    // NEON doesn't support 64-bit vector integer muls, but SVE does.
+    setOperationAction(ISD::MUL, MVT::v1i64, Custom);
+    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+
     // NOTE: Currently this has to happen after computeRegisterProperties rather
     // than the preferred option of combining it with the addRegisterClass call.
     if (Subtarget->useSVEForFixedLengthVectors()) {
@@ -1367,8 +1371,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
       setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
       setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
-      setOperationAction(ISD::MUL, MVT::v1i64, Custom);
-      setOperationAction(ISD::MUL, MVT::v2i64, Custom);
       setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
       setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
       setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
@@ -3950,9 +3952,7 @@ SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
   // If SVE is available then i64 vector multiplications can also be made legal.
   bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
 
-  if (VT.isScalableVector() ||
-      useSVEForFixedLengthVectorVT(
-          VT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors()))
+  if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
 
   // Multiplications are only custom-lowered for 128-bit vectors so that

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
index a34c4404ebd02..d54c3a969a27b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
@@ -1,4 +1,4 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=VBITS_EQ_128
 ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
@@ -22,9 +22,6 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; ADD
 ;
@@ -657,22 +654,32 @@ define void @mul_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
   ret void
 }
 
-; Vector i64 multiplications are not legal for NEON so use SVE when available.
 define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; CHECK-LABEL: mul_v1i64:
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
 ; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: mul_v1i64:
+; VBITS_EQ_128:         ptrue p0.d, vl1
+; VBITS_EQ_128:         mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128:         ret
+
   %res = mul <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
 
-; Vector i64 multiplications are not legal for NEON so use SVE when available.
 define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK-LABEL: mul_v2i64:
 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
 ; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d
 ; CHECK: ret
+
+; VBITS_EQ_128-LABEL: mul_v2i64:
+; VBITS_EQ_128:         ptrue p0.d, vl2
+; VBITS_EQ_128:         mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128:         ret
+
   %res = mul <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
index 42e103694e051..710575a54477c 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
@@ -1,4 +1,4 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=VBITS_EQ_128
 ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK
 ; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK
 ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
@@ -25,14 +25,12 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
 ;
 ; SMULH
 ;
 
 ; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >=256 bits case can be improved.
 define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-LABEL: smulh_v8i8:
 ; CHECK:       // %bb.0:
@@ -166,6 +164,7 @@ define void @smulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >=256 bits case can be improved.
 define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-LABEL: smulh_v4i16:
 ; CHECK:       // %bb.0:
@@ -294,6 +293,15 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    shrn v0.2s, v0.2d, #32
 ; CHECK-NEXT:    ret
+
+; VBITS_EQ_128-LABEL: smulh_v2i32:
+; VBITS_EQ_128:         sshll v0.2d, v0.2s, #0
+; VBITS_EQ_128-NEXT:    sshll v1.2d, v1.2s, #0
+; VBITS_EQ_128-NEXT:    ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT:    shrn v0.2s, v0.2d, #32
+; VBITS_EQ_128-NEXT:    ret
+
   %1 = sext <2 x i32> %op1 to <2 x i64>
   %2 = sext <2 x i32> %op2 to <2 x i64>
   %mul = mul <2 x i64> %1, %2
@@ -521,6 +529,7 @@ define void @smulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 ;
 
 ; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >=256 bits case can be improved.
 define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-LABEL: umulh_v8i8:
 ; CHECK:       // %bb.0:
@@ -652,6 +661,7 @@ define void @umulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 }
 
 ; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >=256 bits case can be improved.
 define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-LABEL: umulh_v4i16:
 ; CHECK:       // %bb.0:
@@ -780,6 +790,15 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    shrn v0.2s, v0.2d, #32
 ; CHECK-NEXT:    ret
+
+; VBITS_EQ_128-LABEL: umulh_v2i32:
+; VBITS_EQ_128:         ushll   v0.2d, v0.2s, #0
+; VBITS_EQ_128-NEXT:    ushll   v1.2d, v1.2s, #0
+; VBITS_EQ_128-NEXT:    ptrue   p0.d, vl2
+; VBITS_EQ_128-NEXT:    mul     z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT:    shrn    v0.2s, v0.2d, #32
+; VBITS_EQ_128-NEXT:    ret
+
   %1 = zext <2 x i32> %op1 to <2 x i64>
   %2 = zext <2 x i32> %op2 to <2 x i64>
   %mul = mul <2 x i64> %1, %2

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
index c86e4f13dade3..a3aeed9c4aed8 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
@@ -697,10 +697,7 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; VBITS_EQ_128:         ptrue p0.d, vl1
 ; VBITS_EQ_128-NEXT:    movprfx z2, z0
 ; VBITS_EQ_128-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT:    fmov x8, d2
-; VBITS_EQ_128-NEXT:    fmov x9, d1
-; VBITS_EQ_128-NEXT:    mul x8, x8, x9
-; VBITS_EQ_128-NEXT:    fmov d1, x8
+; VBITS_EQ_128-NEXT:    mul z1.d, p0/m, z1.d, z2.d
 ; VBITS_EQ_128-NEXT:    sub d0, d0, d1
 ; VBITS_EQ_128-NEXT:    ret
 
@@ -723,14 +720,7 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; VBITS_EQ_128:         ptrue p0.d, vl2
 ; VBITS_EQ_128-NEXT:    movprfx z2, z0
 ; VBITS_EQ_128-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT:    fmov x9, d2
-; VBITS_EQ_128-NEXT:    fmov x10, d1
-; VBITS_EQ_128-NEXT:    mov x8, v2.d[1]
-; VBITS_EQ_128-NEXT:    mov x11, v1.d[1]
-; VBITS_EQ_128-NEXT:    mul x9, x9, x10
-; VBITS_EQ_128-NEXT:    mul x8, x8, x11
-; VBITS_EQ_128-NEXT:    fmov d1, x9
-; VBITS_EQ_128-NEXT:    mov v1.d[1], x8
+; VBITS_EQ_128-NEXT:    mul z1.d, p0/m, z1.d, z2.d
 ; VBITS_EQ_128-NEXT:    sub v0.2d, v0.2d, v1.2d
 ; VBITS_EQ_128-NEXT:    ret
 
@@ -1487,10 +1477,7 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; VBITS_EQ_128:         ptrue p0.d, vl1
 ; VBITS_EQ_128-NEXT:    movprfx z2, z0
 ; VBITS_EQ_128-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT:    fmov x8, d2
-; VBITS_EQ_128-NEXT:    fmov x9, d1
-; VBITS_EQ_128-NEXT:    mul x8, x8, x9
-; VBITS_EQ_128-NEXT:    fmov d1, x8
+; VBITS_EQ_128-NEXT:    mul z1.d, p0/m, z1.d, z2.d
 ; VBITS_EQ_128-NEXT:    sub d0, d0, d1
 ; VBITS_EQ_128-NEXT:    ret
 
@@ -1513,14 +1500,7 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; VBITS_EQ_128:         ptrue p0.d, vl2
 ; VBITS_EQ_128-NEXT:    movprfx z2, z0
 ; VBITS_EQ_128-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT:    fmov x9, d2
-; VBITS_EQ_128-NEXT:    fmov x10, d1
-; VBITS_EQ_128-NEXT:    mov x8, v2.d[1]
-; VBITS_EQ_128-NEXT:    mov x11, v1.d[1]
-; VBITS_EQ_128-NEXT:    mul x9, x9, x10
-; VBITS_EQ_128-NEXT:    mul x8, x8, x11
-; VBITS_EQ_128-NEXT:    fmov d1, x9
-; VBITS_EQ_128-NEXT:    mov v1.d[1], x8
+; VBITS_EQ_128-NEXT:    mul z1.d, p0/m, z1.d, z2.d
 ; VBITS_EQ_128-NEXT:    sub v0.2d, v0.2d, v1.2d
 ; VBITS_EQ_128-NEXT:    ret
 


        

