[llvm] r360942 - [CodeGen] Fixed de-optimization of legalize subvector extract

Thu May 16 14:49:06 PDT 2019

Author: tpr
Date: Thu May 16 14:49:06 2019
New Revision: 360942

URL: http://llvm.org/viewvc/llvm-project?rev=360942&view=rev
Log:
[CodeGen] Fixed de-optimization of legalize subvector extract

The recent introduction of v3i32 etc as an MVT, and its use in AMDGPU
3-dword memory instructions, caused a de-optimization problem for code
with such a load that then bitcasts via vector of i8, because v12i8 is
not an MVT so it legalizes the bitcast by widening it.

This commit adds the ability to widen a bitcast using extract_subvector
on the result, so the value does not need to go via memory.

Differential Revision: https://reviews.llvm.org/D60457

Change-Id: Ie4abb7760547e54a2445961992eafc78e80d4b64

Added:
    llvm/trunk/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp?rev=360942&r1=360941&r2=360942&view=diff
==============================================================================

--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp Thu May 16 14:49:06 2019
@@ -4219,6 +4219,24 @@ SDValue DAGTypeLegalizer::WidenVecOp_BIT
     }
   }
 
+  // Handle a case like bitcast v12i8 -> v3i32. Normally that would get widened
+  // to v16i8 -> v4i32, but for a target where v3i32 is legal but v12i8 is not,
+  // we end up here. Handling the case here with EXTRACT_SUBVECTOR avoids
+  // having to copy via memory.
+  if (VT.isVector()) {
+    EVT EltVT = VT.getVectorElementType();
+    unsigned EltSize = EltVT.getSizeInBits();
+    if (InWidenSize % EltSize == 0) {
+      unsigned NewNumElts = InWidenSize / EltSize;
+      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NewNumElts);
+      if (TLI.isTypeLegal(NewVT)) {
+        SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp);
+        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, BitOp,
+            DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+      }
+    }
+  }
+
   return CreateStackStoreLoad(InOp, VT);
 }
 

Added: llvm/trunk/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll?rev=360942&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll Thu May 16 14:49:06 2019
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s -stop-after=amdgpu-isel | FileCheck -check-prefix=GCN %s
+
+; We want to see a BUFFER_LOAD, some register shuffling, and a BUFFER_STORE.
+; Specifically, we do not want to see a BUFFER_STORE that says "store into
+; stack" in the middle.
+
+define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) {
+  ; GCN-LABEL: name: main
+  ; GCN: bb.0.main_body:
+  ; GCN:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
+  ; GCN:   [[DEF:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF
+  ; GCN:   [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+  ; GCN:   [[DEF1:%[0-9]+]]:sreg_128 = IMPLICIT_DEF
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4)
+  ; GCN:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2
+  ; GCN:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1
+  ; GCN:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0
+  ; GCN:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_96 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[COPY1]], %subreg.sub2
+  ; GCN:   [[COPY4:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]]
+  ; GCN:   [[DEF2:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF
+  ; GCN:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
+  ; GCN:   [[DEF3:%[0-9]+]]:sreg_128 = IMPLICIT_DEF
+  ; GCN:   BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4)
+  ; GCN:   S_ENDPGM 0
+main_body:
+  %tmp25 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> undef, i32 undef, i32 0, i32 0)
+  %tmp27 = bitcast <4 x float> %tmp25 to <16 x i8>
+  %tmp28 = shufflevector <16 x i8> %tmp27, <16 x i8> undef, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %tmp29 = bitcast <12 x i8> %tmp28 to <3 x i32>
+  call void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32> %tmp29, <4 x i32> undef, i32 undef, i32 0, i32 0) #3
+  ret void
+}
+
+declare void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32 immarg)
+declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32 immarg)
+