[llvm] 1121eca - [VP][VE] Default VP_SREM/UREM to Expand and add generic expansion using VP_SDIV/UDIV+VP_MUL+VP_SUB.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 16 13:19:11 PDT 2022
Author: Craig Topper
Date: 2022-09-16T13:19:02-07:00
New Revision: 1121eca685a39c1fc370daa65ff918a3cd490dc4
URL: https://github.com/llvm/llvm-project/commit/1121eca685a39c1fc370daa65ff918a3cd490dc4
DIFF: https://github.com/llvm/llvm-project/commit/1121eca685a39c1fc370daa65ff918a3cd490dc4.diff
LOG: [VP][VE] Default VP_SREM/UREM to Expand and add generic expansion using VP_SDIV/UDIV+VP_MUL+VP_SUB.
I want to default all VP operations to Expand. These two were blocking
because VE doesn't support them and the tests expected them to fail in
a specific way; using Expand caused them to fail differently. It seemed
better to emulate them using operations that are supported.
@simoll mentioned on Discord that VE has some expansion downstream. I'm
not sure whether it is done like this or in the VE target.
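The expansion rewrites the remainder as X - (X/Y)*Y, threading the same
mask and EVL through every step. The identity also holds for the signed
case because sdiv truncates toward zero and srem takes the sign of the
dividend (e.g. -7 srem 3: -7/3 = -2, -2*3 = -6, -7 - (-6) = -1). As an
illustration only (the actual rewrite operates on ISD::VP_* SelectionDAG
nodes during vector legalization; the function below is hypothetical),
the equivalent at the VP-intrinsic IR level looks like:

  declare <256 x i32> @llvm.vp.sdiv.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
  declare <256 x i32> @llvm.vp.mul.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
  declare <256 x i32> @llvm.vp.sub.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)

  ; Hypothetical sketch of the rewrite: x srem y == x - (x/y)*y, with the
  ; mask %m and EVL %n applied to each intermediate operation.
  define <256 x i32> @vp_srem_expanded(<256 x i32> %x, <256 x i32> %y, <256 x i1> %m, i32 %n) {
    %div = call <256 x i32> @llvm.vp.sdiv.v256i32(<256 x i32> %x, <256 x i32> %y, <256 x i1> %m, i32 %n)
    %mul = call <256 x i32> @llvm.vp.mul.v256i32(<256 x i32> %y, <256 x i32> %div, <256 x i1> %m, i32 %n)
    %rem = call <256 x i32> @llvm.vp.sub.v256i32(<256 x i32> %x, <256 x i32> %mul, <256 x i1> %m, i32 %n)
    ret <256 x i32> %rem
  }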
Reviewed By: frasercrmck, efocht
Differential Revision: https://reviews.llvm.org/D133514
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
llvm/lib/CodeGen/TargetLoweringBase.cpp
llvm/test/CodeGen/VE/Vector/vp_srem.ll
llvm/test/CodeGen/VE/Vector/vp_urem.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 4eafb8e9013fc..0132bf4a8affb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -132,6 +132,7 @@ class VectorLegalizer {
SDValue ExpandVSELECT(SDNode *Node);
SDValue ExpandVP_SELECT(SDNode *Node);
SDValue ExpandVP_MERGE(SDNode *Node);
+ SDValue ExpandVP_REM(SDNode *Node);
SDValue ExpandSELECT(SDNode *Node);
std::pair<SDValue, SDValue> ExpandLoad(SDNode *N);
SDValue ExpandStore(SDNode *N);
@@ -735,6 +736,13 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
case ISD::VP_SELECT:
Results.push_back(ExpandVP_SELECT(Node));
return;
+ case ISD::VP_SREM:
+ case ISD::VP_UREM:
+ if (SDValue Expanded = ExpandVP_REM(Node)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
case ISD::SELECT:
Results.push_back(ExpandSELECT(Node));
return;
@@ -1310,6 +1318,30 @@ SDValue VectorLegalizer::ExpandVP_MERGE(SDNode *Node) {
return DAG.getSelect(DL, Node->getValueType(0), FullMask, Op1, Op2);
}
+SDValue VectorLegalizer::ExpandVP_REM(SDNode *Node) {
+ // Implement VP_SREM/UREM in terms of VP_SDIV/VP_UDIV, VP_MUL, VP_SUB.
+ EVT VT = Node->getValueType(0);
+
+ unsigned DivOpc = Node->getOpcode() == ISD::VP_SREM ? ISD::VP_SDIV : ISD::VP_UDIV;
+
+ if (!TLI.isOperationLegalOrCustom(DivOpc, VT) ||
+ !TLI.isOperationLegalOrCustom(ISD::VP_MUL, VT) ||
+ !TLI.isOperationLegalOrCustom(ISD::VP_SUB, VT))
+ return SDValue();
+
+ SDLoc DL(Node);
+
+ SDValue Dividend = Node->getOperand(0);
+ SDValue Divisor = Node->getOperand(1);
+ SDValue Mask = Node->getOperand(2);
+ SDValue EVL = Node->getOperand(3);
+
+ // X % Y -> X-X/Y*Y
+ SDValue Div = DAG.getNode(DivOpc, DL, VT, Dividend, Divisor, Mask, EVL);
+ SDValue Mul = DAG.getNode(ISD::VP_MUL, DL, VT, Divisor, Div, Mask, EVL);
+ return DAG.getNode(ISD::VP_SUB, DL, VT, Dividend, Mul, Mask, EVL);
+}
+
void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node,
SmallVectorImpl<SDValue> &Results) {
// Attempt to expand using TargetLowering.
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 8eaf1bb1b3972..9736318486626 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -870,6 +870,11 @@ void TargetLoweringBase::initActions() {
// Named vector shuffles default to expand.
setOperationAction(ISD::VECTOR_SPLICE, VT, Expand);
+
+ // VP_SREM/UREM default to expand.
+ // TODO: Expand all VP intrinsics.
+ setOperationAction(ISD::VP_SREM, VT, Expand);
+ setOperationAction(ISD::VP_UREM, VT, Expand);
}
// Most targets ignore the @llvm.prefetch intrinsic.
diff --git a/llvm/test/CodeGen/VE/Vector/vp_srem.ll b/llvm/test/CodeGen/VE/Vector/vp_srem.ll
index 15a3dcaf288aa..ffd93e340ebdc 100644
--- a/llvm/test/CodeGen/VE/Vector/vp_srem.ll
+++ b/llvm/test/CodeGen/VE/Vector/vp_srem.ll
@@ -1,16 +1,95 @@
-; REQUIRES: asserts
-; RUN: not --crash llc < %s -march=ve -mattr=+vpu -o /dev/null 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
-; CHECK: t{{[0-9]+}}: v256i32 = vp_srem [[A:t[0-9]+]], [[B:t[0-9]+]], [[MASK:t[0-9]+]], [[EVL:t[0-9]+]]
-; CHECK: [[A]]: v256i32
-; CHECK: [[B]]: v256i32
-; CHECK: [[MASK]]: v256i1
-; CHECK: [[EVL]]: i32
+declare <256 x i32> @llvm.vp.srem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
-define <256 x i32> @test_vp_int(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+define fastcc <256 x i32> @test_vp_srem_v256i32_vv(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i32_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vdivs.w.sx %v2, %v0, %v1, %vm1
+; CHECK-NEXT: vmuls.w.sx %v1, %v1, %v2, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
%r0 = call <256 x i32> @llvm.vp.srem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
ret <256 x i32> %r0
}
-; integer arith
-declare <256 x i32> @llvm.vp.srem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
+define fastcc <256 x i32> @test_vp_srem_v256i32_rv(i32 %s0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i32_rv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivs.w.sx %v1, %s0, %v0, %vm1
+; CHECK-NEXT: vmuls.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %s0, %v0, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %xins = insertelement <256 x i32> undef, i32 %s0, i32 0
+ %i0 = shufflevector <256 x i32> %xins, <256 x i32> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i32> @llvm.vp.srem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i32> %r0
+}
+
+define fastcc <256 x i32> @test_vp_srem_v256i32_vr(<256 x i32> %i0, i32 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i32_vr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivs.w.sx %v1, %v0, %s0, %vm1
+; CHECK-NEXT: vmuls.w.sx %v1, %s0, %v1, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %yins = insertelement <256 x i32> undef, i32 %s1, i32 0
+ %i1 = shufflevector <256 x i32> %yins, <256 x i32> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i32> @llvm.vp.srem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i32> %r0
+}
+
+
+declare <256 x i64> @llvm.vp.srem.v256i64(<256 x i64>, <256 x i64>, <256 x i1>, i32)
+
+define fastcc <256 x i64> @test_vp_int_v256i64_vv(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_int_v256i64_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vdivs.l %v2, %v0, %v1, %vm1
+; CHECK-NEXT: vmuls.l %v1, %v1, %v2, %vm1
+; CHECK-NEXT: vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r0 = call <256 x i64> @llvm.vp.srem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}
+
+define fastcc <256 x i64> @test_vp_srem_v256i64_rv(i64 %s0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i64_rv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivs.l %v1, %s0, %v0, %vm1
+; CHECK-NEXT: vmuls.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: vsubs.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %xins = insertelement <256 x i64> undef, i64 %s0, i32 0
+ %i0 = shufflevector <256 x i64> %xins, <256 x i64> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i64> @llvm.vp.srem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}
+
+define fastcc <256 x i64> @test_vp_srem_v256i64_vr(<256 x i64> %i0, i64 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i64_vr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivs.l %v1, %v0, %s0, %vm1
+; CHECK-NEXT: vmuls.l %v1, %s0, %v1, %vm1
+; CHECK-NEXT: vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %yins = insertelement <256 x i64> undef, i64 %s1, i32 0
+ %i1 = shufflevector <256 x i64> %yins, <256 x i64> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i64> @llvm.vp.srem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vp_urem.ll b/llvm/test/CodeGen/VE/Vector/vp_urem.ll
index 5465e9fdb1439..0620c89781443 100644
--- a/llvm/test/CodeGen/VE/Vector/vp_urem.ll
+++ b/llvm/test/CodeGen/VE/Vector/vp_urem.ll
@@ -1,16 +1,95 @@
-; REQUIRES: asserts
-; RUN: not --crash llc < %s -march=ve -mattr=+vpu -o /dev/null 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
-; CHECK: t{{[0-9]+}}: v256i32 = vp_urem [[A:t[0-9]+]], [[B:t[0-9]+]], [[MASK:t[0-9]+]], [[EVL:t[0-9]+]]
-; CHECK: [[A]]: v256i32
-; CHECK: [[B]]: v256i32
-; CHECK: [[MASK]]: v256i1
-; CHECK: [[EVL]]: i32
+declare <256 x i32> @llvm.vp.urem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
-define <256 x i32> @test_vp_int(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+define fastcc <256 x i32> @test_vp_urem_v256i32_vv(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i32_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vdivu.w %v2, %v0, %v1, %vm1
+; CHECK-NEXT: vmuls.w.sx %v1, %v1, %v2, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
%r0 = call <256 x i32> @llvm.vp.urem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
ret <256 x i32> %r0
}
-; integer arith
-declare <256 x i32> @llvm.vp.urem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
+define fastcc <256 x i32> @test_vp_urem_v256i32_rv(i32 %s0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i32_rv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivu.w %v1, %s0, %v0, %vm1
+; CHECK-NEXT: vmuls.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %s0, %v0, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %xins = insertelement <256 x i32> undef, i32 %s0, i32 0
+ %i0 = shufflevector <256 x i32> %xins, <256 x i32> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i32> @llvm.vp.urem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i32> %r0
+}
+
+define fastcc <256 x i32> @test_vp_urem_v256i32_vr(<256 x i32> %i0, i32 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i32_vr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivu.w %v1, %v0, %s0, %vm1
+; CHECK-NEXT: vmuls.w.sx %v1, %s0, %v1, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %yins = insertelement <256 x i32> undef, i32 %s1, i32 0
+ %i1 = shufflevector <256 x i32> %yins, <256 x i32> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i32> @llvm.vp.urem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i32> %r0
+}
+
+
+declare <256 x i64> @llvm.vp.urem.v256i64(<256 x i64>, <256 x i64>, <256 x i1>, i32)
+
+define fastcc <256 x i64> @test_vp_int_v256i64_vv(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_int_v256i64_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vdivu.l %v2, %v0, %v1, %vm1
+; CHECK-NEXT: vmuls.l %v1, %v1, %v2, %vm1
+; CHECK-NEXT: vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r0 = call <256 x i64> @llvm.vp.urem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}
+
+define fastcc <256 x i64> @test_vp_urem_v256i64_rv(i64 %s0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i64_rv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivu.l %v1, %s0, %v0, %vm1
+; CHECK-NEXT: vmuls.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: vsubs.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %xins = insertelement <256 x i64> undef, i64 %s0, i32 0
+ %i0 = shufflevector <256 x i64> %xins, <256 x i64> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i64> @llvm.vp.urem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}
+
+define fastcc <256 x i64> @test_vp_urem_v256i64_vr(<256 x i64> %i0, i64 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i64_vr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivu.l %v1, %v0, %s0, %vm1
+; CHECK-NEXT: vmuls.l %v1, %s0, %v1, %vm1
+; CHECK-NEXT: vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %yins = insertelement <256 x i64> undef, i64 %s1, i32 0
+ %i1 = shufflevector <256 x i64> %yins, <256 x i64> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i64> @llvm.vp.urem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}