[llvm] 1dfd1b3 - [AMDGPU] Tune threshold for cmp/select vector lowering
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Thu May 21 08:59:47 PDT 2020
Author: Stanislav Mekhanoshin
Date: 2020-05-21T08:59:35-07:00
New Revision: 1dfd1b3e4b2b4ba73416bf9fb5cc8fa837116ce3
URL: https://github.com/llvm/llvm-project/commit/1dfd1b3e4b2b4ba73416bf9fb5cc8fa837116ce3
DIFF: https://github.com/llvm/llvm-project/commit/1dfd1b3e4b2b4ba73416bf9fb5cc8fa837116ce3.diff
LOG: [AMDGPU] Tune threshold for cmp/select vector lowering
The threshold was expressed as total vector size, while the idea was to
limit the number of instructions. Now that the lowering also works with
doubles, the threshold needs to be updated.
Differential Revision: https://reviews.llvm.org/D80322
Added:
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 492b4f72a055..e3197e5ca3d3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9474,6 +9474,39 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
return SDValue();
}
+// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
+// expanded into a set of cmp/select instructions.
+static bool shouldExpandVectorDynExt(SDNode *N) {
+ SDValue Idx = N->getOperand(N->getNumOperands() - 1);
+ if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx))
+ return false;
+
+ SDValue Vec = N->getOperand(0);
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+ unsigned VecSize = VecVT.getSizeInBits();
+ unsigned EltSize = EltVT.getSizeInBits();
+ unsigned NumElem = VecVT.getVectorNumElements();
+
+ // Sub-dword vectors of size 2 dword or less have better implementation.
+ if (VecSize <= 64 && EltSize < 32)
+ return false;
+
+ // Always expand the rest of sub-dword instructions, otherwise it will be
+ // lowered via memory.
+ if (EltSize < 32)
+ return true;
+
+ // Always do this if var-idx is divergent, otherwise it will become a loop.
+ if (Idx->isDivergent())
+ return true;
+
+ // Large vectors would yield too many compares and v_cndmask_b32 instructions.
+ unsigned NumInsts = NumElem /* Number of compares */ +
+ ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
+ return NumInsts <= 16;
+}
+
SDValue SITargetLowering::performExtractVectorEltCombine(
SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Vec = N->getOperand(0);
@@ -9535,15 +9568,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
unsigned EltSize = EltVT.getSizeInBits();
// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
- // This elminates non-constant index and subsequent movrel or scratch access.
- // Sub-dword vectors of size 2 dword or less have better implementation.
- // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
- // instructions.
- // Always do this if var-idx is divergent, otherwise it will become a loop.
- if (!UseDivergentRegisterIndexing &&
- (VecSize <= 256 || N->getOperand(1)->isDivergent()) &&
- (VecSize > 64 || EltSize >= 32) &&
- !isa<ConstantSDNode>(N->getOperand(1))) {
+ if (shouldExpandVectorDynExt(N)) {
SDLoc SL(N);
SDValue Idx = N->getOperand(1);
SDValue V;
@@ -9603,19 +9628,10 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N,
SDValue Idx = N->getOperand(2);
EVT VecVT = Vec.getValueType();
EVT EltVT = VecVT.getVectorElementType();
- unsigned VecSize = VecVT.getSizeInBits();
- unsigned EltSize = EltVT.getSizeInBits();
// INSERT_VECTOR_ELT (<n x e>, var-idx)
// => BUILD_VECTOR n x select (e, const-idx)
- // This elminates non-constant index and subsequent movrel or scratch access.
- // Sub-dword vectors of size 2 dword or less have better implementation.
- // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
- // instructions.
- // Always do this if var-idx is divergent, otherwise it will become a loop.
- if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx) ||
- (VecSize > 256 && !Idx->isDivergent()) ||
- (VecSize <= 64 && EltSize < 32))
+ if (!shouldExpandVectorDynExt(N))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index 59a913657e46..1ec749059452 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -48,6 +48,24 @@ entry:
ret void
}
+; GCN-LABEL: {{^}}double5_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cmp_eq_u32_e64 [[C4:[^,]+]], [[IDX]], 4
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C4]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
+define amdgpu_kernel void @double5_extelt(double addrspace(1)* %out, i32 %sel) {
+entry:
+ %ext = extractelement <5 x double> <double 0.01, double 1.01, double 2.01, double 4.01, double 5.01>, i32 %sel
+ store double %ext, double addrspace(1)* %out
+ ret void
+}
+
; GCN-LABEL: {{^}}half4_extelt:
; GCN-NOT: buffer_
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 876542c19fe8..851d232e00a6 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -277,6 +277,17 @@ entry:
ret void
}
+; GCN-LABEL: {{^}}double5_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-COUNT-10: v_cndmask_b32
+define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
+entry:
+ %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
+ store <5 x double> %v, <5 x double> addrspace(1)* %out
+ ret void
+}
+
; GCN-LABEL: {{^}}double8_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_
More information about the llvm-commits
mailing list