[llvm] r334836 - AMDGPU: Add combine for short vector extract_vector_elts

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 15 08:31:36 PDT 2018


Author: arsenm
Date: Fri Jun 15 08:31:36 2018
New Revision: 334836

URL: http://llvm.org/viewvc/llvm-project?rev=334836&view=rev
Log:
AMDGPU: Add combine for short vector extract_vector_elts

Try to access the pieces 4 bytes at a time. This helps
various hasOneUse extract_vector_elt combines, such
as load width reductions.

Avoids test regressions in a future commit.

Modified:
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll

Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=334836&r1=334835&r2=334836&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Fri Jun 15 08:31:36 2018
@@ -7097,8 +7097,11 @@ SDValue SITargetLowering::performCvtPkRT
 SDValue SITargetLowering::performExtractVectorEltCombine(
   SDNode *N, DAGCombinerInfo &DCI) const {
   SDValue Vec = N->getOperand(0);
-
   SelectionDAG &DAG = DCI.DAG;
+
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+
   if ((Vec.getOpcode() == ISD::FNEG ||
        Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
     SDLoc SL(N);
@@ -7139,6 +7142,44 @@ SDValue SITargetLowering::performExtract
                                      Vec.getOperand(1), Idx));
     }
   }
+
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  unsigned VecSize = VecVT.getSizeInBits();
+  unsigned EltSize = EltVT.getSizeInBits();
+
+  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
+  // elements. This exposes more load reduction opportunities by replacing
+  // multiple small extract_vector_elements with a single 32-bit extract.
+  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (EltSize <= 16 &&
+      EltVT.isByteSized() &&
+      VecSize > 32 &&
+      VecSize % 32 == 0 &&
+      Idx) {
+    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
+
+    unsigned BitIndex = Idx->getZExtValue() * EltSize;
+    unsigned EltIdx = BitIndex / 32;
+    unsigned LeftoverBitIdx = BitIndex % 32;
+    SDLoc SL(N);
+
+    SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
+    DCI.AddToWorklist(Cast.getNode());
+
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
+                              DAG.getConstant(EltIdx, SL, MVT::i32));
+    DCI.AddToWorklist(Elt.getNode());
+    SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
+                              DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
+    DCI.AddToWorklist(Srl.getNode());
+
+    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
+    DCI.AddToWorklist(Trunc.getNode());
+    return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
+  }
+
   return SDValue();
 }
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll?rev=334836&r1=334835&r2=334836&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll Fri Jun 15 08:31:36 2018
@@ -141,6 +141,36 @@ define amdgpu_kernel void @v_inserteleme
   ret void
 }
 
+; GCN-LABEL: {{^}}reduce_load_vector_v8f16_extract_01:
+; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(<16 x half> addrspace(4)* %ptr) #0 {
+  %load = load <16 x half>, <16 x half> addrspace(4)* %ptr
+  %elt0 = extractelement <16 x half> %load, i32 0
+  %elt1 = extractelement <16 x half> %load, i32 1
+  store volatile half %elt0, half addrspace(1)* undef, align 2
+  store volatile half %elt1, half addrspace(1)* undef, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}reduce_load_vector_v8f16_extract_23:
+; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}}
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(<16 x half> addrspace(4)* %ptr) #0 {
+  %load = load <16 x half>, <16 x half> addrspace(4)* %ptr
+  %elt2 = extractelement <16 x half> %load, i32 2
+  %elt3 = extractelement <16 x half> %load, i32 3
+  store volatile half %elt2, half addrspace(1)* undef, align 2
+  store volatile half %elt3, half addrspace(1)* undef, align 2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }

Modified: llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll?rev=334836&r1=334835&r2=334836&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll Fri Jun 15 08:31:36 2018
@@ -142,6 +142,36 @@ define amdgpu_kernel void @v_inserteleme
   ret void
 }
 
+; GCN-LABEL: {{^}}reduce_load_vector_v8i16_extract_01:
+; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+define amdgpu_kernel void @reduce_load_vector_v8i16_extract_01(<16 x i16> addrspace(4)* %ptr) #0 {
+  %load = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
+  %elt0 = extractelement <16 x i16> %load, i32 0
+  %elt1 = extractelement <16 x i16> %load, i32 1
+  store volatile i16 %elt0, i16 addrspace(1)* undef, align 2
+  store volatile i16 %elt1, i16 addrspace(1)* undef, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}reduce_load_vector_v8i16_extract_23:
+; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}}
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+define amdgpu_kernel void @reduce_load_vector_v8i16_extract_23(<16 x i16> addrspace(4)* %ptr) #0 {
+  %load = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
+  %elt2 = extractelement <16 x i16> %load, i32 2
+  %elt3 = extractelement <16 x i16> %load, i32 3
+  store volatile i16 %elt2, i16 addrspace(1)* undef, align 2
+  store volatile i16 %elt3, i16 addrspace(1)* undef, align 2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }

Modified: llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll?rev=334836&r1=334835&r2=334836&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll Fri Jun 15 08:31:36 2018
@@ -199,4 +199,78 @@ define amdgpu_kernel void @dynamic_extra
   ret void
 }
 
+; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_0123:
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dword s
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 24
+define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
+  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
+  %elt0 = extractelement <8 x i8> %load, i32 0
+  %elt1 = extractelement <8 x i8> %load, i32 1
+  %elt2 = extractelement <8 x i8> %load, i32 2
+  %elt3 = extractelement <8 x i8> %load, i32 3
+  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt2, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt3, i8 addrspace(1)* undef, align 1
+  ret void
+}
+
+; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_0145:
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dwordx2
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
+  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
+  %elt0 = extractelement <8 x i8> %load, i32 0
+  %elt1 = extractelement <8 x i8> %load, i32 1
+  %elt4 = extractelement <8 x i8> %load, i32 4
+  %elt5 = extractelement <8 x i8> %load, i32 5
+  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
+  ret void
+}
+
+; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_45:
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_mov_b64 [[PTR:s\[[0-9]+:[0-9]+\]]], 4{{$}}
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0{{$}}
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
+  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
+  %elt4 = extractelement <8 x i8> %load, i32 4
+  %elt5 = extractelement <8 x i8> %load, i32 5
+  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
+  ret void
+}
+
+; FIXME: ought to be able to eliminate high half of load
+; GCN-LABEL: {{^}}reduce_load_vector_v16i8_extract_0145:
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dwordx4
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
+  %load = load <16 x i8>, <16 x i8> addrspace(4)* null
+  %elt0 = extractelement <16 x i8> %load, i32 0
+  %elt1 = extractelement <16 x i8> %load, i32 1
+  %elt4 = extractelement <16 x i8> %load, i32 4
+  %elt5 = extractelement <16 x i8> %load, i32 5
+  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
+  ret void
+}
+
 attributes #0 = { nounwind }




More information about the llvm-commits mailing list