[llvm] eabae1b - [AArch64][CodeGen] Always use SVE (when enabled) to lower 64-bit vector multiplies
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 8 07:37:58 PST 2022
Author: David Sherwood
Date: 2022-02-08T15:37:52Z
New Revision: eabae1b0175691d1f979299b22a25ed4474864a0
URL: https://github.com/llvm/llvm-project/commit/eabae1b0175691d1f979299b22a25ed4474864a0
DIFF: https://github.com/llvm/llvm-project/commit/eabae1b0175691d1f979299b22a25ed4474864a0.diff
LOG: [AArch64][CodeGen] Always use SVE (when enabled) to lower 64-bit vector multiplies
This patch adds custom lowering support for ISD::MUL with v1i64 and v2i64
types when SVE is enabled, regardless of the minimum SVE vector length. We
do this because NEON simply does not have 64-bit vector multiplies, so we
want to take advantage of SVE's 64-bit vector multiply instructions instead.
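
As an illustration only (not part of the commit), here is a minimal IR sketch
of the case this covers. The function name and attribute group are
hypothetical, but the expected codegen mirrors the VBITS_EQ_128 checks added
in the tests below:

define <2 x i64> @example_mul_v2i64(<2 x i64> %a, <2 x i64> %b) #0 {
  ; With SVE enabled this multiply now selects a predicated SVE mul, e.g.:
  ;   ptrue p0.d, vl2
  ;   mul   z0.d, p0/m, z0.d, z1.d
  ; rather than being scalarised through general-purpose registers.
  %res = mul <2 x i64> %a, %b
  ret <2 x i64> %res
}

attributes #0 = { "target-features"="+sve" }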
I've updated the tests that use a minimum SVE vector length of 128 bits:
CodeGen/AArch64/sve-fixed-length-int-arith.ll
CodeGen/AArch64/sve-fixed-length-int-mulh.ll
CodeGen/AArch64/sve-fixed-length-int-rem.ll
Differential Revision: https://reviews.llvm.org/D118802
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bf97741dbe70b..34d13ec69c0ab 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1341,6 +1341,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UDIV, VT, Custom);
}
+ // NEON doesn't support 64-bit vector integer muls, but SVE does.
+ setOperationAction(ISD::MUL, MVT::v1i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+
// NOTE: Currently this has to happen after computeRegisterProperties rather
// than the preferred option of combining it with the addRegisterClass call.
if (Subtarget->useSVEForFixedLengthVectors()) {
@@ -1367,8 +1371,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
- setOperationAction(ISD::MUL, MVT::v1i64, Custom);
- setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
@@ -3950,9 +3952,7 @@ SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
// If SVE is available then i64 vector multiplications can also be made legal.
bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
- if (VT.isScalableVector() ||
- useSVEForFixedLengthVectorVT(
- VT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors()))
+ if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
// Multiplications are only custom-lowered for 128-bit vectors so that
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
index a34c4404ebd02..d54c3a969a27b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
@@ -1,4 +1,4 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=VBITS_EQ_128
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
@@ -22,9 +22,6 @@
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; ADD
;
@@ -657,22 +654,32 @@ define void @mul_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
ret void
}
-; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; CHECK-LABEL: mul_v1i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d
; CHECK: ret
+
+; VBITS_EQ_128-LABEL: mul_v1i64:
+; VBITS_EQ_128: ptrue p0.d, vl1
+; VBITS_EQ_128: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128: ret
+
%res = mul <1 x i64> %op1, %op2
ret <1 x i64> %res
}
-; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; CHECK-LABEL: mul_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK: mul z0.d, [[PG]]/m, z0.d, z1.d
; CHECK: ret
+
+; VBITS_EQ_128-LABEL: mul_v2i64:
+; VBITS_EQ_128: ptrue p0.d, vl2
+; VBITS_EQ_128: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128: ret
+
%res = mul <2 x i64> %op1, %op2
ret <2 x i64> %res
}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
index 42e103694e051..710575a54477c 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
@@ -1,4 +1,4 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=VBITS_EQ_128
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
@@ -25,14 +25,12 @@
target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
;
; SMULH
;
; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >=256 bits case can be improved.
define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-LABEL: smulh_v8i8:
; CHECK: // %bb.0:
@@ -166,6 +164,7 @@ define void @smulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >=256 bits case can be improved.
define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: smulh_v4i16:
; CHECK: // %bb.0:
@@ -294,6 +293,15 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: shrn v0.2s, v0.2d, #32
; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: smulh_v2i32:
+; VBITS_EQ_128: sshll v0.2d, v0.2s, #0
+; VBITS_EQ_128-NEXT: sshll v1.2d, v1.2s, #0
+; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32
+; VBITS_EQ_128-NEXT: ret
+
%1 = sext <2 x i32> %op1 to <2 x i64>
%2 = sext <2 x i32> %op2 to <2 x i64>
%mul = mul <2 x i64> %1, %2
@@ -521,6 +529,7 @@ define void @smulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >=256 bits case can be improved.
define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-LABEL: umulh_v8i8:
; CHECK: // %bb.0:
@@ -652,6 +661,7 @@ define void @umulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
+; FIXME: The codegen for the >=256 bits case can be improved.
define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: umulh_v4i16:
; CHECK: // %bb.0:
@@ -780,6 +790,15 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: shrn v0.2s, v0.2d, #32
; CHECK-NEXT: ret
+
+; VBITS_EQ_128-LABEL: umulh_v2i32:
+; VBITS_EQ_128: ushll v0.2d, v0.2s, #0
+; VBITS_EQ_128-NEXT: ushll v1.2d, v1.2s, #0
+; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32
+; VBITS_EQ_128-NEXT: ret
+
%1 = zext <2 x i32> %op1 to <2 x i64>
%2 = zext <2 x i32> %op2 to <2 x i64>
%mul = mul <2 x i64> %1, %2
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
index c86e4f13dade3..a3aeed9c4aed8 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
@@ -697,10 +697,7 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; VBITS_EQ_128: ptrue p0.d, vl1
; VBITS_EQ_128-NEXT: movprfx z2, z0
; VBITS_EQ_128-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: fmov x8, d2
-; VBITS_EQ_128-NEXT: fmov x9, d1
-; VBITS_EQ_128-NEXT: mul x8, x8, x9
-; VBITS_EQ_128-NEXT: fmov d1, x8
+; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
; VBITS_EQ_128-NEXT: sub d0, d0, d1
; VBITS_EQ_128-NEXT: ret
@@ -723,14 +720,7 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; VBITS_EQ_128: ptrue p0.d, vl2
; VBITS_EQ_128-NEXT: movprfx z2, z0
; VBITS_EQ_128-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: fmov x9, d2
-; VBITS_EQ_128-NEXT: fmov x10, d1
-; VBITS_EQ_128-NEXT: mov x8, v2.d[1]
-; VBITS_EQ_128-NEXT: mov x11, v1.d[1]
-; VBITS_EQ_128-NEXT: mul x9, x9, x10
-; VBITS_EQ_128-NEXT: mul x8, x8, x11
-; VBITS_EQ_128-NEXT: fmov d1, x9
-; VBITS_EQ_128-NEXT: mov v1.d[1], x8
+; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
; VBITS_EQ_128-NEXT: sub v0.2d, v0.2d, v1.2d
; VBITS_EQ_128-NEXT: ret
@@ -1487,10 +1477,7 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; VBITS_EQ_128: ptrue p0.d, vl1
; VBITS_EQ_128-NEXT: movprfx z2, z0
; VBITS_EQ_128-NEXT: udiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: fmov x8, d2
-; VBITS_EQ_128-NEXT: fmov x9, d1
-; VBITS_EQ_128-NEXT: mul x8, x8, x9
-; VBITS_EQ_128-NEXT: fmov d1, x8
+; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
; VBITS_EQ_128-NEXT: sub d0, d0, d1
; VBITS_EQ_128-NEXT: ret
@@ -1513,14 +1500,7 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; VBITS_EQ_128: ptrue p0.d, vl2
; VBITS_EQ_128-NEXT: movprfx z2, z0
; VBITS_EQ_128-NEXT: udiv z2.d, p0/m, z2.d, z1.d
-; VBITS_EQ_128-NEXT: fmov x9, d2
-; VBITS_EQ_128-NEXT: fmov x10, d1
-; VBITS_EQ_128-NEXT: mov x8, v2.d[1]
-; VBITS_EQ_128-NEXT: mov x11, v1.d[1]
-; VBITS_EQ_128-NEXT: mul x9, x9, x10
-; VBITS_EQ_128-NEXT: mul x8, x8, x11
-; VBITS_EQ_128-NEXT: fmov d1, x9
-; VBITS_EQ_128-NEXT: mov v1.d[1], x8
+; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z2.d
; VBITS_EQ_128-NEXT: sub v0.2d, v0.2d, v1.2d
; VBITS_EQ_128-NEXT: ret