[llvm] 95efea4 - [AArch64][SVE] Widen narrow sdiv/udiv operations.
Eli Friedman via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 4 13:22:43 PDT 2020
Author: Eli Friedman
Date: 2020-08-04T13:22:15-07:00
New Revision: 95efea4b9310bb204a42fbf29abd4efa65647000
URL: https://github.com/llvm/llvm-project/commit/95efea4b9310bb204a42fbf29abd4efa65647000
DIFF: https://github.com/llvm/llvm-project/commit/95efea4b9310bb204a42fbf29abd4efa65647000.diff
LOG: [AArch64][SVE] Widen narrow sdiv/udiv operations.
The SVE instruction set only supports sdiv/udiv for 32-bit and 64-bit
integers. If we see an 8-bit or 16-bit divide, widen the operands to 32
bits, and narrow the result.
Differential Revision: https://reviews.llvm.org/D85170
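[Editor's illustration, not part of the commit message or diff: the value the new lowering computes is the same as widening at the IR level and truncating afterwards. A minimal sketch for the nxv8i16 sdiv case, using a hypothetical @sdiv_i16_widened function name:

define <vscale x 8 x i16> @sdiv_i16_widened(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i32>
  %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i32>
  ; Divide in the 32-bit element type that SVE SDIV supports...
  %div.wide = sdiv <vscale x 8 x i32> %a.wide, %b.wide
  ; ...then narrow the result back to the original element type.
  %div = trunc <vscale x 8 x i32> %div.wide to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %div
}

The committed lowering performs this on SelectionDAG nodes instead: it splits each operand with SUNPKLO/SUNPKHI (UUNPKLO/UUNPKHI for udiv) into two wider halves, emits a predicated divide per half, and recombines the results with UZP1, as the new LowerDIV in the diff below shows. For nxv16i8 the same step is applied twice, going through nxv8i16 to nxv4i32.]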
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 402d7656ca21..03b33086e0c8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3536,9 +3536,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::INSERT_SUBVECTOR:
return LowerINSERT_SUBVECTOR(Op, DAG);
case ISD::SDIV:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::SDIV_PRED);
case ISD::UDIV:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::UDIV_PRED);
+ return LowerDIV(Op, DAG);
case ISD::SMIN:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
case ISD::UMIN:
@@ -8791,6 +8790,35 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
return SDValue();
}
+SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ bool Signed = Op.getOpcode() == ISD::SDIV;
+ unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
+ if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
+ return LowerToPredicatedOp(Op, DAG, PredOpcode);
+
+ // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
+ // operations, and truncate the result.
+ EVT WidenedVT;
+ if (VT == MVT::nxv16i8)
+ WidenedVT = MVT::nxv8i16;
+ else if (VT == MVT::nxv8i16)
+ WidenedVT = MVT::nxv4i32;
+ else
+ llvm_unreachable("Unexpected Custom DIV operation");
+
+ SDLoc dl(Op);
+ unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
+ unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
+ SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
+ SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
+ SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
+ SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
+ SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
+ SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
+ return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
+}
+
bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
// Currently no fixed length shuffles that require SVE are legal.
if (useSVEForFixedLengthVectorVT(VT))
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index c346debb823a..72c9e69ce7b8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -863,6 +863,7 @@ class AArch64TargetLowering : public TargetLowering {
unsigned NewOp) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
index 9f3a77c8fe92..a2ab019247e2 100644
--- a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
+++ b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
@@ -5,6 +5,50 @@
; SDIV
;
+define <vscale x 16 x i8> @sdiv_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: sdiv_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sunpkhi z2.h, z1.b
+; CHECK-NEXT: sunpkhi z3.h, z0.b
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: sunpklo z1.h, z1.b
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: sunpkhi z4.s, z2.h
+; CHECK-NEXT: sunpkhi z5.s, z3.h
+; CHECK-NEXT: sunpklo z2.s, z2.h
+; CHECK-NEXT: sunpklo z3.s, z3.h
+; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT: sunpkhi z5.s, z1.h
+; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: sunpkhi z3.s, z0.h
+; CHECK-NEXT: sunpklo z1.s, z1.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z5.s
+; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+ %div = sdiv <vscale x 16 x i8> %a, %b
+ ret <vscale x 16 x i8> %div
+}
+
+define <vscale x 8 x i16> @sdiv_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: sdiv_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sunpkhi z2.s, z1.h
+; CHECK-NEXT: sunpkhi z3.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: sunpklo z1.s, z1.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: ret
+ %div = sdiv <vscale x 8 x i16> %a, %b
+ ret <vscale x 8 x i16> %div
+}
+
define <vscale x 4 x i32> @sdiv_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: sdiv_i32:
; CHECK: // %bb.0:
@@ -63,6 +107,57 @@ define <vscale x 4 x i64> @sdiv_split_i64(<vscale x 4 x i64> %a, <vscale x 4 x i
; SREM
;
+define <vscale x 16 x i8> @srem_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: srem_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sunpkhi z2.h, z1.b
+; CHECK-NEXT: sunpkhi z3.h, z0.b
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: sunpklo z4.h, z1.b
+; CHECK-NEXT: sunpklo z5.h, z0.b
+; CHECK-NEXT: sunpkhi z6.s, z2.h
+; CHECK-NEXT: sunpkhi z7.s, z3.h
+; CHECK-NEXT: sunpklo z2.s, z2.h
+; CHECK-NEXT: sunpklo z3.s, z3.h
+; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT: sunpkhi z7.s, z4.h
+; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: sunpkhi z3.s, z5.h
+; CHECK-NEXT: sunpklo z4.s, z4.h
+; CHECK-NEXT: sunpklo z5.s, z5.h
+; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z7.s
+; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h
+; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h
+; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: mul z2.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: sub z0.b, z0.b, z2.b
+; CHECK-NEXT: ret
+ %div = srem <vscale x 16 x i8> %a, %b
+ ret <vscale x 16 x i8> %div
+}
+
+define <vscale x 8 x i16> @srem_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: srem_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sunpkhi z2.s, z1.h
+; CHECK-NEXT: sunpkhi z3.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: sunpklo z4.s, z1.h
+; CHECK-NEXT: sunpklo z5.s, z0.h
+; CHECK-NEXT: movprfx z3, z5
+; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mul z2.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: sub z0.h, z0.h, z2.h
+; CHECK-NEXT: ret
+ %div = srem <vscale x 8 x i16> %a, %b
+ ret <vscale x 8 x i16> %div
+}
+
define <vscale x 4 x i32> @srem_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: srem_i32:
; CHECK: // %bb.0:
@@ -93,6 +188,50 @@ define <vscale x 2 x i64> @srem_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b
; UDIV
;
+define <vscale x 16 x i8> @udiv_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: udiv_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z2.h, z1.b
+; CHECK-NEXT: uunpkhi z3.h, z0.b
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uunpklo z1.h, z1.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z4.s, z2.h
+; CHECK-NEXT: uunpkhi z5.s, z3.h
+; CHECK-NEXT: uunpklo z2.s, z2.h
+; CHECK-NEXT: uunpklo z3.s, z3.h
+; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT: uunpkhi z5.s, z1.h
+; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: uunpkhi z3.s, z0.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z5.s
+; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+ %div = udiv <vscale x 16 x i8> %a, %b
+ ret <vscale x 16 x i8> %div
+}
+
+define <vscale x 8 x i16> @udiv_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: udiv_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpkhi z3.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: ret
+ %div = udiv <vscale x 8 x i16> %a, %b
+ ret <vscale x 8 x i16> %div
+}
+
define <vscale x 4 x i32> @udiv_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: udiv_i32:
; CHECK: // %bb.0:
@@ -152,6 +291,57 @@ define <vscale x 4 x i64> @udiv_split_i64(<vscale x 4 x i64> %a, <vscale x 4 x i
; UREM
;
+define <vscale x 16 x i8> @urem_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: urem_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z2.h, z1.b
+; CHECK-NEXT: uunpkhi z3.h, z0.b
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uunpklo z4.h, z1.b
+; CHECK-NEXT: uunpklo z5.h, z0.b
+; CHECK-NEXT: uunpkhi z6.s, z2.h
+; CHECK-NEXT: uunpkhi z7.s, z3.h
+; CHECK-NEXT: uunpklo z2.s, z2.h
+; CHECK-NEXT: uunpklo z3.s, z3.h
+; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT: uunpkhi z7.s, z4.h
+; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: uunpkhi z3.s, z5.h
+; CHECK-NEXT: uunpklo z4.s, z4.h
+; CHECK-NEXT: uunpklo z5.s, z5.h
+; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z7.s
+; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h
+; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h
+; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: mul z2.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: sub z0.b, z0.b, z2.b
+; CHECK-NEXT: ret
+ %div = urem <vscale x 16 x i8> %a, %b
+ ret <vscale x 16 x i8> %div
+}
+
+define <vscale x 8 x i16> @urem_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: urem_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpkhi z3.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: uunpklo z4.s, z1.h
+; CHECK-NEXT: uunpklo z5.s, z0.h
+; CHECK-NEXT: movprfx z3, z5
+; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mul z2.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: sub z0.h, z0.h, z2.h
+; CHECK-NEXT: ret
+ %div = urem <vscale x 8 x i16> %a, %b
+ ret <vscale x 8 x i16> %div
+}
+
define <vscale x 4 x i32> @urem_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: urem_i32:
; CHECK: // %bb.0: