[llvm] r341018 - DAG: Don't use ABI copies in some contexts

Matt Arsenault via llvm-commits llvm-commits@lists.llvm.org
Wed Aug 29 22:49:28 PDT 2018


Author: arsenm
Date: Wed Aug 29 22:49:28 2018
New Revision: 341018

URL: http://llvm.org/viewvc/llvm-project?rev=341018&view=rev
Log:
DAG: Don't use ABI copies in some contexts

If an ABI-like value, such as a call result, is used in
a different block, the type split used is not necessarily
the same as the call's ABI. The value is instead used
through an intermediate set of copies to virtual registers
from the defining block. This resulted in copies with
inconsistent sizes later.

Fixes regressions since r338197 when AMDGPU started
splitting vector types for calls.
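
For reference, the CC argument is what steers the split: RegsForValue
chooses between the calling-convention-aware and the default register
counts and types. A simplified sketch of that dispatch, abbreviated
from the constructor as of this revision (details in the real tree
differ slightly):

  RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
                             const DataLayout &DL, unsigned Reg, Type *Ty,
                             Optional<CallingConv::ID> CC)
      : CallConv(CC) {
    ComputeValueVTs(TLI, DL, Ty, ValueVTs);
    for (EVT ValueVT : ValueVTs) {
      // With a calling convention, the target may split the value into a
      // different number (and kind) of registers than default legalization.
      unsigned NumRegs =
          CC ? TLI.getNumRegistersForCallingConv(Context, *CC, ValueVT)
             : TLI.getNumRegisters(Context, ValueVT);
      MVT RegisterVT =
          CC ? TLI.getRegisterTypeForCallingConv(Context, *CC, ValueVT)
             : TLI.getRegisterType(Context, ValueVT);
      for (unsigned i = 0; i != NumRegs; ++i)
        Regs.push_back(Reg + i);
      RegVTs.push_back(RegisterVT);
      Reg += NumRegs;
    }
  }

Passing None below therefore forces the default split on both ends of
the cross-block copy, so the producer and the consumer agree.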

Added:
    llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp?rev=341018&r1=341017&r2=341018&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp Wed Aug 29 22:49:28 2018
@@ -1178,7 +1178,8 @@ SDValue SelectionDAGBuilder::getCopyFrom
     unsigned InReg = It->second;
 
     RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
-                     DAG.getDataLayout(), InReg, Ty, getABIRegCopyCC(V));
+                     DAG.getDataLayout(), InReg, Ty,
+                     None); // This is not an ABI copy.
     SDValue Chain = DAG.getEntryNode();
     Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr,
                                  V);
@@ -8696,7 +8697,7 @@ SelectionDAGBuilder::CopyValueToVirtualR
   // notional registers required by the type.
 
   RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg, V->getType(),
-                   getABIRegCopyCC(V));
+                   None); // This is not an ABI copy.
   SDValue Chain = DAG.getEntryNode();
 
   ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) ==

Added: llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll?rev=341018&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll Wed Aug 29 22:49:28 2018
@@ -0,0 +1,176 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; SelectionDAG builder was using the IR value kind to decide how to
+; split the types for copyToRegs/copyFromRegs in all contexts. This
+; was incorrect if an ABI-like value, such as a call result, was used
+; outside of its defining block. The value in that case is not used
+; directly, but through another set of copies to potentially different
+; register types in the defining block.
+
+; This would then end up producing inconsistent pairs of copies with
+; the wrong sizes when the vector-typed result from the call was split
+; into multiple pieces, but was expected to be a single register in the
+; cross-block copy.
+;
+; This isn't exactly ideal for AMDGPU, since in reality the
+; intermediate vector register type is undesirable anyway, but it
+; requires more work to be able to split all vector copies in all
+; contexts.
+;
+; This was only an issue if the value was used directly in another
+; block. If there was an intermediate operation or a phi, it was fine,
+; since that didn't look like an ABI copy.
+
+
+define float @call_split_type_used_outside_block_v2f32() #0 {
+; GCN-LABEL: call_split_type_used_outside_block_v2f32:
+; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s5, s32
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v32, s33, 0
+; GCN-NEXT:    v_writelane_b32 v32, s34, 1
+; GCN-NEXT:    s_add_u32 s32, s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v32, s35, 2
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, func_v2f32@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, func_v2f32@rel32@hi+4
+; GCN-NEXT:    s_mov_b64 s[34:35], s[30:31]
+; GCN-NEXT:    s_mov_b32 s33, s5
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    s_mov_b32 s5, s33
+; GCN-NEXT:    s_mov_b64 s[30:31], s[34:35]
+; GCN-NEXT:    v_readlane_b32 s35, v32, 2
+; GCN-NEXT:    v_readlane_b32 s34, v32, 1
+; GCN-NEXT:    v_readlane_b32 s33, v32, 0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+bb0:
+  %split.ret.type = call <2 x float> @func_v2f32()
+  br label %bb1
+
+bb1:
+  %extract = extractelement <2 x float> %split.ret.type, i32 0
+  ret float %extract
+}
+
+define float @call_split_type_used_outside_block_v3f32() #0 {
+; GCN-LABEL: call_split_type_used_outside_block_v3f32:
+; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s5, s32
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v32, s33, 0
+; GCN-NEXT:    v_writelane_b32 v32, s34, 1
+; GCN-NEXT:    s_add_u32 s32, s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v32, s35, 2
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, func_v3f32@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, func_v3f32@rel32@hi+4
+; GCN-NEXT:    s_mov_b64 s[34:35], s[30:31]
+; GCN-NEXT:    s_mov_b32 s33, s5
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    s_mov_b32 s5, s33
+; GCN-NEXT:    s_mov_b64 s[30:31], s[34:35]
+; GCN-NEXT:    v_readlane_b32 s35, v32, 2
+; GCN-NEXT:    v_readlane_b32 s34, v32, 1
+; GCN-NEXT:    v_readlane_b32 s33, v32, 0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+bb0:
+  %split.ret.type = call <3 x float> @func_v3f32()
+  br label %bb1
+
+bb1:
+  %extract = extractelement <3 x float> %split.ret.type, i32 0
+  ret float %extract
+}
+
+define half @call_split_type_used_outside_block_v4f16() #0 {
+; GCN-LABEL: call_split_type_used_outside_block_v4f16:
+; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s5, s32
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v32, s33, 0
+; GCN-NEXT:    v_writelane_b32 v32, s34, 1
+; GCN-NEXT:    s_add_u32 s32, s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v32, s35, 2
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, func_v4f16@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, func_v4f16@rel32@hi+4
+; GCN-NEXT:    s_mov_b64 s[34:35], s[30:31]
+; GCN-NEXT:    s_mov_b32 s33, s5
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    s_mov_b32 s5, s33
+; GCN-NEXT:    s_mov_b64 s[30:31], s[34:35]
+; GCN-NEXT:    v_readlane_b32 s35, v32, 2
+; GCN-NEXT:    v_readlane_b32 s34, v32, 1
+; GCN-NEXT:    v_readlane_b32 s33, v32, 0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+bb0:
+  %split.ret.type = call <4 x half> @func_v4f16()
+  br label %bb1
+
+bb1:
+  %extract = extractelement <4 x half> %split.ret.type, i32 0
+  ret half %extract
+}
+
+define { i32, half } @call_split_type_used_outside_block_struct() #0 {
+; GCN-LABEL: call_split_type_used_outside_block_struct:
+; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s5, s32
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v32, s33, 0
+; GCN-NEXT:    v_writelane_b32 v32, s34, 1
+; GCN-NEXT:    s_add_u32 s32, s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v32, s35, 2
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, func_struct@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, func_struct@rel32@hi+4
+; GCN-NEXT:    s_mov_b64 s[34:35], s[30:31]
+; GCN-NEXT:    s_mov_b32 s33, s5
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    s_mov_b32 s5, s33
+; GCN-NEXT:    s_mov_b64 s[30:31], s[34:35]
+; GCN-NEXT:    v_readlane_b32 s35, v32, 2
+; GCN-NEXT:    v_readlane_b32 s34, v32, 1
+; GCN-NEXT:    v_readlane_b32 s33, v32, 0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    v_mov_b32_e32 v1, v4
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+bb0:
+  %split.ret.type = call { <4 x i32>, <4 x half> } @func_struct()
+  br label %bb1
+
+bb1:
+  %val0 = extractvalue { <4 x i32>, <4 x half> } %split.ret.type, 0
+  %val1 = extractvalue { <4 x i32>, <4 x half> } %split.ret.type, 1
+  %extract0 = extractelement <4 x i32> %val0, i32 0
+  %extract1 = extractelement <4 x half> %val1, i32 0
+  %ins0 = insertvalue { i32, half } undef, i32 %extract0, 0
+  %ins1 = insertvalue { i32, half } %ins0, half %extract1, 1
+  ret { i32, half } %ins1
+}
+
+
+declare <2 x float> @func_v2f32() #0
+declare <3 x float> @func_v3f32() #0
+declare <4 x float> @func_v4f32() #0
+declare <4 x half> @func_v4f16() #0
+
+declare { <4 x i32>, <4 x half> } @func_struct() #0
+
+attributes #0 = { nounwind }
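
As the comment in the test notes, routing the result through an
intermediate operation or a phi did not hit the bug, since the phi's
virtual register never looked like an ABI copy. A sketch of such an
unaffected variant (hypothetical function, not part of the committed
test):

  define float @call_split_type_through_phi() #0 {
  bb0:
    %call = call <2 x float> @func_v2f32()
    br label %bb1

  bb1:
    ; The phi is an ordinary IR value, so both ends of the cross-block
    ; copy already used the default (non-ABI) type split.
    %phi = phi <2 x float> [ %call, %bb0 ]
    %extract = extractelement <2 x float> %phi, i32 0
    ret float %extract
  }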
