[llvm] 5157f98 - [AMDGPU] Enable divergence-driven XNOR selection
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 26 04:30:35 PST 2022
Author: alex-t
Date: 2022-01-26T15:33:10+03:00
New Revision: 5157f984ae2c5e6fe9a8bd3a5dae99d5a96a276d
URL: https://github.com/llvm/llvm-project/commit/5157f984ae2c5e6fe9a8bd3a5dae99d5a96a276d
DIFF: https://github.com/llvm/llvm-project/commit/5157f984ae2c5e6fe9a8bd3a5dae99d5a96a276d.diff
LOG: [AMDGPU] Enable divergence-driven XNOR selection
Currently the not (xor_one_use) pattern is always selected to S_XNOR, irrespective of the node divergence.
This relies on a later custom selection pass that converts to VALU if necessary and replaces it with V_NOT_B32 (V_XOR_B32)
on targets that have no V_XNOR.
This change adds patterns that explicitly select not (xor_one_use) to the appropriate form based on divergence.
We assume that xor (not) has already been turned into not (xor) by the combiner.
Reviewed By: rampitec
Differential Revision: https://reviews.llvm.org/D116270
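For reference, a condensed IR sketch of the pattern in question, mirroring the new test added below (function and value names are illustrative only): with uniform inputs the not (xor) is selected to S_XNOR_B32/B64, while with divergent inputs it now goes to V_NOT_B32 (V_XOR_B32), or to V_XNOR_B32 on subtargets that have it (e.g. gfx906).

  ; not (xor a, b) with uniform (scalar) operands -> S_XNOR_B32
  define amdgpu_kernel void @uniform_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
    %xor = xor i32 %a, %b
    %res = xor i32 %xor, -1
    store i32 %res, i32 addrspace(1)* %out
    ret void
  }

  ; the same pattern with divergent operands -> V_XOR_B32 + V_NOT_B32,
  ; or V_XNOR_B32 where available
  define i32 @divergent_xnor_i32(i32 %a, i32 %b) {
    %xor = xor i32 %a, %b
    %res = xor i32 %xor, -1
    ret i32 %res
  }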
Added:
llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll
Modified:
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.h
llvm/lib/Target/AMDGPU/SOPInstructions.td
llvm/lib/Target/AMDGPU/VOP2Instructions.td
llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
llvm/test/CodeGen/AMDGPU/permute.ll
llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
llvm/test/CodeGen/AMDGPU/xnor.ll
llvm/test/CodeGen/AMDGPU/xor3.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 0497847e74316..bec1915705947 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3292,6 +3292,17 @@ class TargetLowering : public TargetLoweringBase {
return false;
}
+ // Lets the target control the following reassociation of operands: (op (op
+ // x, c1), y) -> (op (op x, y), c1), where N0 is (op x, c1) and N1 is y. By
+ // default, any case where N0 has a single use is considered profitable. This
+ // mirrors the condition this target hook call replaced in the DAGCombiner. A
+ // target can implement its own heuristic to restrict the common combiner.
+ virtual bool isReassocProfitable(SelectionDAG &DAG, SDValue N0,
+ SDValue N1) const {
+ return N0.hasOneUse();
+ }
+
virtual bool isSDNodeAlwaysUniform(const SDNode * N) const {
return false;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7e3dbb91f514c..932f263d2558c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1070,7 +1070,7 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
return DAG.getNode(Opc, DL, VT, N00, OpNode);
return SDValue();
}
- if (N0.hasOneUse()) {
+ if (TLI.isReassocProfitable(DAG, N0, N1)) {
// Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
// iff (op x, c1) has one use
if (SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1))
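To illustrate the reassociation the new hook now gates, here is a hypothetical IR-level sketch (the constant and value names are made up for illustration); the combiner only pulls the constant outward when the target reports the rewrite as profitable.

  define i32 @reassoc_example(i32 %x, i32 %y) {
    ; (op (op x, c1), y), where (op x, c1) has a single use
    %t = xor i32 %x, 16
    %r = xor i32 %t, %y
    ; when isReassocProfitable returns true, this is rewritten to
    ;   (xor (xor %x, %y), 16)
    ret i32 %r
  }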
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ec610160b2278..47c9db6627e7b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9639,6 +9639,9 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
SDValue SITargetLowering::performXorCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
+ if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
+ return RV;
+
EVT VT = N->getValueType(0);
if (VT != MVT::i64)
return SDValue();
@@ -10551,6 +10554,9 @@ SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
+ if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
+ return SDValue();
+
unsigned Opc = N->getOpcode();
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -10572,12 +10578,6 @@ SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
if (Op1->isDivergent())
std::swap(Op1, Op2);
- // If either operand is constant this will conflict with
- // DAGCombiner::ReassociateOps().
- if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
- DAG.isConstantIntBuildVectorOrConstantInt(Op1))
- return SDValue();
-
SDLoc SL(N);
SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
return DAG.getNode(Opc, SL, VT, Add1, Op2);
@@ -12578,3 +12578,27 @@ SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
Cost.first += (Size + 255) / 256;
return Cost;
}
+
+bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
+ SDNode::use_iterator I = N->use_begin(), E = N->use_end();
+ for (; I != E; ++I) {
+ if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
+ if (getBasePtrIndex(M) == I.getOperandNo())
+ return true;
+ }
+ }
+ return false;
+}
+
+bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
+ SDValue N1) const {
+ if (!N0.hasOneUse())
+ return false;
+ // Take care of the opportunity to keep N0 uniform
+ if (N0->isDivergent() || !N1->isDivergent())
+ return true;
+ // Check if we have a good chance to form the memory access pattern with the
+ // base and offset
+ return (DAG.isBaseWithConstantOffset(N0) &&
+ hasMemSDNodeUser(*N0->use_begin()));
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 1315cc15dd021..bf81e082b478e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -449,6 +449,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool isSDNodeSourceOfDivergence(const SDNode *N,
FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override;
+ bool hasMemSDNodeUser(SDNode *N) const;
+
+ bool isReassocProfitable(SelectionDAG &DAG, SDValue N0,
+ SDValue N1) const override;
+
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
unsigned MaxDepth = 5) const;
bool isCanonicalized(Register Reg, MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 324d36091827c..3f7837f7dbf11 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -550,11 +550,11 @@ def S_XOR_B64 : SOP2_64 <"s_xor_b64",
>;
def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
- [(set i32:$sdst, (not (xor_oneuse i32:$src0, i32:$src1)))]
+ [(set i32:$sdst, (UniformUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1)))]
>;
def S_XNOR_B64 : SOP2_64 <"s_xnor_b64",
- [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))]
+ [(set i64:$sdst, (UniformUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1)))]
>;
def S_NAND_B32 : SOP2_32 <"s_nand_b32",
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 8d232ffe41141..b9ff814a4dc5c 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -637,9 +637,9 @@ class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
)
>;
-def : divergent_i64_BinOp <and, V_AND_B32_e32>;
-def : divergent_i64_BinOp <or, V_OR_B32_e32>;
-def : divergent_i64_BinOp <xor, V_XOR_B32_e32>;
+def : divergent_i64_BinOp <and, V_AND_B32_e64>;
+def : divergent_i64_BinOp <or, V_OR_B32_e64>;
+def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
let SubtargetPredicate = Has16BitInsts in {
@@ -688,6 +688,36 @@ let SubtargetPredicate = HasDLInsts in {
let isReMaterializable = 1 in
defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32, xnor>;
+def : GCNPat<
+ (i32 (DivergentUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1))),
+ (i32 (V_XNOR_B32_e64 $src0, $src1))
+>;
+
+def : GCNPat<
+ (i32 (DivergentBinFrag<xor_oneuse> (not i32:$src0), i32:$src1)),
+ (i32 (V_XNOR_B32_e64 $src0, $src1))
+>;
+
+def : GCNPat<
+ (i64 (DivergentUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1))),
+ (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64
+ (i32 (EXTRACT_SUBREG $src0, sub0)),
+ (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0,
+ (i32 (V_XNOR_B32_e64
+ (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1)
+>;
+
+def : GCNPat<
+ (i64 (DivergentBinFrag<xor_oneuse> (not i64:$src0), i64:$src1)),
+ (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64
+ (i32 (EXTRACT_SUBREG $src0, sub0)),
+ (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0,
+ (i32 (V_XNOR_B32_e64
+ (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1)
+>;
+
let Constraints = "$vdst = $src2",
DisableEncoding = "$src2",
isConvertibleToThreeAddress = 1,
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll
new file mode 100644
index 0000000000000..ec5979c119835
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll
@@ -0,0 +1,44 @@
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx906 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN_DL %s
+
+; GCN-LABEL: name: uniform_xnor_i64
+; GCN: S_XNOR_B64
+define amdgpu_kernel void @uniform_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %xor = xor i64 %a, %b
+ %res = xor i64 %xor, -1
+ store i64 %res, i64 addrspace(1)* %out
+ ret void
+}
+; GCN-LABEL: name: divergent_xnor_i64
+; GCN: V_XOR_B32_e64
+; GCN: V_XOR_B32_e64
+; GCN: V_NOT_B32_e32
+; GCN: V_NOT_B32_e32
+; GCN_DL: V_XNOR_B32_e64
+; GCN_DL: V_XNOR_B32_e64
+define i64 @divergent_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %xor = xor i64 %a, %b
+ %res = xor i64 %xor, -1
+ ret i64 %res
+}
+
+; GCN-LABEL: name: uniform_xnor_i32
+; GCN: S_XNOR_B32
+define amdgpu_kernel void @uniform_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+ %xor = xor i32 %a, %b
+ %res = xor i32 %xor, -1
+ store i32 %res, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: name: divergent_xnor_i32
+; GCN: V_XOR_B32_e64
+; GCN: V_NOT_B32_e32
+; GCN_DL: V_XNOR_B32_e64
+define i32 @divergent_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+ %xor = xor i32 %a, %b
+ %res = xor i32 %xor, -1
+ ret i32 %res
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
index d4b7b7d9cf2f5..6ac24a99fbb14 100644
--- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
+++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
@@ -163,8 +163,8 @@ define amdgpu_kernel void @divergent_xor3_b64(<3 x i64> addrspace(1)* %arg) {
; GCN-NEXT: v_xor_b32_e32 v1, v3, v1
; GCN-NEXT: v_xor_b32_e32 v0, v2, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_xnor_b32_e32 v0, v0, v4
; GCN-NEXT: v_xnor_b32_e32 v1, v1, v5
+; GCN-NEXT: v_xnor_b32_e32 v0, v0, v4
; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
; GCN-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll
index d8c20acc666b1..26af6f4d0f2bc 100644
--- a/llvm/test/CodeGen/AMDGPU/permute.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute.ll
@@ -106,8 +106,11 @@ bb:
}
; GCN-LABEL: {{^}}and_or_or_and:
-; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500
-; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff00
+; GCN: s_or_b32 [[SREG:s[0-9]+]], s{{[0-9]+}}, 0xffff0000
+; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 0xff00ff, v{{[0-9]+}}
+; GCN: v_or_b32_e32 v{{[0-9]+}}, [[SREG]], [[VREG]]
+; FIXME here should have been "v_perm_b32" with 0xffff0500 mask.
define amdgpu_kernel void @and_or_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -153,10 +156,14 @@ bb:
}
; GCN-LABEL: {{^}}known_ffff0500:
-; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500
-; GCN-DAG: v_mov_b32_e32 [[RES:v[0-9]+]], 0xffff8004
-; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
+; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 0xffff8004
+; GCN: s_and_b32 [[SREG:s[0-9]+]], [[SREG]], 0xff00
+; GCN: s_or_b32 [[SREG]], [[SREG]], 0xffff0000
+; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 0xff00ff, [[VREG]]
+; GCN: v_or_b32_e32 [[VREG]], [[SREG]], [[VREG]]
+; GCN: store_dword v[{{[0-9:]+}}], [[VREG]]{{$}}
; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
+; FIXME here should have been "v_perm_b32" with 0xffff0500 mask.
define amdgpu_kernel void @known_ffff0500(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 6a079bc4e52d8..516c154f996d0 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -472,10 +472,10 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
-; GFX9-O0-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
-; GFX9-O0-NEXT: v_or_b32_e32 v6, v1, v2
+; GFX9-O0-NEXT: v_or_b32_e64 v6, v1, v2
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
diff --git a/llvm/test/CodeGen/AMDGPU/xnor.ll b/llvm/test/CodeGen/AMDGPU/xnor.ll
index 4b2fde48d1a47..bf5cb3ece02cd 100644
--- a/llvm/test/CodeGen/AMDGPU/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xnor.ll
@@ -61,8 +61,8 @@ entry:
; GCN-LABEL: {{^}}vector_xnor_i32_one_use
; GCN-NOT: s_xnor_b32
-; GCN: v_not_b32
; GCN: v_xor_b32
+; GCN: v_not_b32
; GCN-DL: v_xnor_b32
define i32 @vector_xnor_i32_one_use(i32 %a, i32 %b) {
entry:
@@ -73,10 +73,10 @@ entry:
; GCN-LABEL: {{^}}vector_xnor_i64_one_use
; GCN-NOT: s_xnor_b64
-; GCN: v_not_b32
-; GCN: v_not_b32
; GCN: v_xor_b32
; GCN: v_xor_b32
+; GCN: v_not_b32
+; GCN: v_not_b32
; GCN-DL: v_xnor_b32
; GCN-DL: v_xnor_b32
define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) {
@@ -150,8 +150,8 @@ entry:
; GCN-LABEL: {{^}}vector_xor_na_b_i32_one_use
; GCN-NOT: s_xnor_b32
-; GCN: v_not_b32
; GCN: v_xor_b32
+; GCN: v_not_b32
; GCN-DL: v_xnor_b32
define i32 @vector_xor_na_b_i32_one_use(i32 %a, i32 %b) {
entry:
@@ -162,8 +162,8 @@ entry:
; GCN-LABEL: {{^}}vector_xor_a_nb_i32_one_use
; GCN-NOT: s_xnor_b32
-; GCN: v_not_b32
; GCN: v_xor_b32
+; GCN: v_not_b32
; GCN-DL: v_xnor_b32
define i32 @vector_xor_a_nb_i32_one_use(i32 %a, i32 %b) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/xor3.ll b/llvm/test/CodeGen/AMDGPU/xor3.ll
index b43fb96de2c16..813a096f1ca34 100644
--- a/llvm/test/CodeGen/AMDGPU/xor3.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor3.ll
@@ -26,13 +26,13 @@ define amdgpu_ps float @xor3(i32 %a, i32 %b, i32 %c) {
define amdgpu_ps float @xor3_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) {
; GFX9-LABEL: xor3_vgpr_b:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0
-; GFX9-NEXT: v_xor_b32_e32 v0, s3, v0
+; GFX9-NEXT: s_xor_b32 s0, s3, s2
+; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: xor3_vgpr_b:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_xor3_b32 v0, s2, v0, s3
+; GFX10-NEXT: v_xor3_b32 v0, s3, s2, v0
; GFX10-NEXT: ; return to shader part epilog
%x = xor i32 %a, %b
%result = xor i32 %x, %c