[llvm] 1dfd1b3 - [AMDGPU] Tune threshold for cmp/select vector lowering

Thu May 21 08:59:47 PDT 2020

Author: Stanislav Mekhanoshin
Date: 2020-05-21T08:59:35-07:00
New Revision: 1dfd1b3e4b2b4ba73416bf9fb5cc8fa837116ce3

URL: https://github.com/llvm/llvm-project/commit/1dfd1b3e4b2b4ba73416bf9fb5cc8fa837116ce3
DIFF: https://github.com/llvm/llvm-project/commit/1dfd1b3e4b2b4ba73416bf9fb5cc8fa837116ce3.diff

LOG: [AMDGPU] Tune threshold for cmp/select vector lowering

It was set in total vector size while the idea was to limit
a number of instructions. Now it started to work with doubles
and thresholds needs to be updated.

Differential Revision: https://reviews.llvm.org/D80322

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 492b4f72a055..e3197e5ca3d3 100644

--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9474,6 +9474,39 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
   return SDValue();
 }
 
+// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
+// expanded into a set of cmp/select instructions.
+static bool shouldExpandVectorDynExt(SDNode *N) {
+  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
+  if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx))
+    return false;
+
+  SDValue Vec = N->getOperand(0);
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  unsigned VecSize = VecVT.getSizeInBits();
+  unsigned EltSize = EltVT.getSizeInBits();
+  unsigned NumElem = VecVT.getVectorNumElements();
+
+  // Sub-dword vectors of size 2 dword or less have better implementation.
+  if (VecSize <= 64 && EltSize < 32)
+    return false;
+
+  // Always expand the rest of sub-dword instructions, otherwise it will be
+  // lowered via memory.
+  if (EltSize < 32)
+    return true;
+
+  // Always do this if var-idx is divergent, otherwise it will become a loop.
+  if (Idx->isDivergent())
+    return true;
+
+  // Large vectors would yield too many compares and v_cndmask_b32 instructions.
+  unsigned NumInsts = NumElem /* Number of compares */ +
+                      ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
+  return NumInsts <= 16;
+}
+
 SDValue SITargetLowering::performExtractVectorEltCombine(
   SDNode *N, DAGCombinerInfo &DCI) const {
   SDValue Vec = N->getOperand(0);
@@ -9535,15 +9568,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
   unsigned EltSize = EltVT.getSizeInBits();
 
   // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
-  // This elminates non-constant index and subsequent movrel or scratch access.
-  // Sub-dword vectors of size 2 dword or less have better implementation.
-  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
-  // instructions.
-  // Always do this if var-idx is divergent, otherwise it will become a loop.
-  if (!UseDivergentRegisterIndexing &&
-      (VecSize <= 256 || N->getOperand(1)->isDivergent()) &&
-      (VecSize > 64 || EltSize >= 32) &&
-      !isa<ConstantSDNode>(N->getOperand(1))) {
+  if (shouldExpandVectorDynExt(N)) {
     SDLoc SL(N);
     SDValue Idx = N->getOperand(1);
     SDValue V;
@@ -9603,19 +9628,10 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N,
   SDValue Idx = N->getOperand(2);
   EVT VecVT = Vec.getValueType();
   EVT EltVT = VecVT.getVectorElementType();
-  unsigned VecSize = VecVT.getSizeInBits();
-  unsigned EltSize = EltVT.getSizeInBits();
 
   // INSERT_VECTOR_ELT (<n x e>, var-idx)
   // => BUILD_VECTOR n x select (e, const-idx)
-  // This elminates non-constant index and subsequent movrel or scratch access.
-  // Sub-dword vectors of size 2 dword or less have better implementation.
-  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
-  // instructions.
-  // Always do this if var-idx is divergent, otherwise it will become a loop.
-  if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx) ||
-      (VecSize > 256 && !Idx->isDivergent()) ||
-      (VecSize <= 64 && EltSize < 32))
+  if (!shouldExpandVectorDynExt(N))
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;

diff  --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index 59a913657e46..1ec749059452 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -48,6 +48,24 @@ entry:
   ret void
 }
 
+; GCN-LABEL: {{^}}double5_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cmp_eq_u32_e64 [[C4:[^,]+]], [[IDX]], 4
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C4]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
+define amdgpu_kernel void @double5_extelt(double addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <5 x double> <double 0.01, double 1.01, double 2.01, double 4.01, double 5.01>, i32 %sel
+  store double %ext, double addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}half4_extelt:
 ; GCN-NOT: buffer_
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 876542c19fe8..851d232e00a6 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -277,6 +277,17 @@ entry:
   ret void
 }
 
+; GCN-LABEL: {{^}}double5_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-COUNT-10: v_cndmask_b32
+define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
+entry:
+  %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
+  store <5 x double> %v, <5 x double> addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}double8_inselt:
 ; GCN-NOT: v_cndmask
 ; GCN-NOT: buffer_