[llvm] r341801 - DAG: Handle odd vector sizes in calling conv splitting
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 10 04:49:23 PDT 2018
Author: arsenm
Date: Mon Sep 10 04:49:23 2018
New Revision: 341801
URL: http://llvm.org/viewvc/llvm-project?rev=341801&view=rev
Log:
DAG: Handle odd vector sizes in calling conv splitting
This already worked if only one register piece was used,
but didn't if a type was split into multiple, unequally
sized pieces.
Fixes not splitting v3i16/v3f16 into two registers for
AMDGPU.
This will also allow fixing the ABI for 16-bit vectors
in a future commit so that it's the same for all subtargets.
Modified:
llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll
llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
llvm/trunk/test/CodeGen/AMDGPU/function-args.ll
llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll
llvm/trunk/test/CodeGen/AMDGPU/mad-mix-lo.ll
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp?rev=341801&r1=341800&r2=341801&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp Mon Sep 10 04:49:23 2018
@@ -701,33 +701,38 @@ static void getCopyToPartsVector(Selecti
TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
NumIntermediates, RegisterVT);
}
- unsigned NumElements = ValueVT.getVectorNumElements();
assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
NumParts = NumRegs; // Silence a compiler warning.
assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
+ unsigned IntermediateNumElts = IntermediateVT.isVector() ?
+ IntermediateVT.getVectorNumElements() : 1;
+
// Convert the vector to the appropiate type if necessary.
- unsigned DestVectorNoElts =
- NumIntermediates *
- (IntermediateVT.isVector() ? IntermediateVT.getVectorNumElements() : 1);
+ unsigned DestVectorNoElts = NumIntermediates * IntermediateNumElts;
+
EVT BuiltVectorTy = EVT::getVectorVT(
*DAG.getContext(), IntermediateVT.getScalarType(), DestVectorNoElts);
- if (Val.getValueType() != BuiltVectorTy)
+ MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ if (ValueVT != BuiltVectorTy) {
+ if (SDValue Widened = widenVectorToPartType(DAG, Val, DL, BuiltVectorTy))
+ Val = Widened;
+
Val = DAG.getNode(ISD::BITCAST, DL, BuiltVectorTy, Val);
+ }
// Split the vector into intermediate operands.
SmallVector<SDValue, 8> Ops(NumIntermediates);
for (unsigned i = 0; i != NumIntermediates; ++i) {
- if (IntermediateVT.isVector())
- Ops[i] =
- DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val,
- DAG.getConstant(i * (NumElements / NumIntermediates), DL,
- TLI.getVectorIdxTy(DAG.getDataLayout())));
- else
+ if (IntermediateVT.isVector()) {
+ Ops[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val,
+ DAG.getConstant(i * IntermediateNumElts, DL, IdxVT));
+ } else {
Ops[i] = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, DL, IntermediateVT, Val,
- DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ DAG.getConstant(i, DL, IdxVT));
+ }
}
// Split the intermediate operands into legal parts.
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=341801&r1=341800&r2=341801&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Mon Sep 10 04:49:23 2018
@@ -718,9 +718,7 @@ MVT SITargetLowering::getRegisterTypeFor
if (Size == 64)
return MVT::i32;
- if (Size == 16 &&
- Subtarget->has16BitInsts() &&
- isPowerOf2_32(VT.getVectorNumElements()))
+ if (Size == 16 && Subtarget->has16BitInsts())
return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
}
@@ -741,9 +739,8 @@ unsigned SITargetLowering::getNumRegiste
if (Size == 64)
return 2 * NumElts;
- // FIXME: Fails to break down as we want with v3.
- if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts))
- return VT.getVectorNumElements() / 2;
+ if (Size == 16 && Subtarget->has16BitInsts())
+ return (VT.getVectorNumElements() + 1) / 2;
}
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
@@ -774,10 +771,10 @@ unsigned SITargetLowering::getVectorType
// FIXME: We should fix the ABI to be the same on targets without 16-bit
// support, but unless we can properly handle 3-vectors, it will be still be
// inconsistent.
- if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts)) {
+ if (Size == 16 && Subtarget->has16BitInsts()) {
RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
IntermediateVT = RegisterVT;
- NumIntermediates = NumElts / 2;
+ NumIntermediates = (NumElts + 1) / 2;
return NumIntermediates;
}
}
Modified: llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll?rev=341801&r1=341800&r2=341801&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll Mon Sep 10 04:49:23 2018
@@ -399,18 +399,35 @@ define amdgpu_kernel void @test_call_ext
ret void
}
-; FIXME: materialize constant directly in VGPR
+; GCN-LABEL: {{^}}test_call_external_void_func_v3f16:
+; GFX9: buffer_load_dwordx2 v[0:1]
+; GFX9-NOT: v0
+; GFX9-NOT: v1
+; GFX9: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
+ %val = load <3 x half>, <3 x half> addrspace(1)* undef
+ call void @external_void_func_v3f16(<3 x half> %val)
+ ret void
+}
+
; GCN-LABEL: {{^}}test_call_external_void_func_v3i16_imm:
-; GFX9-DAG: s_mov_b32 [[K01:s[0-9]+]], 0x20001
-; GFX9-DAG: s_mov_b32 [[K2:s[0-9]+]], 3
-; GFX9: v_mov_b32_e32 v0, [[K01]]
-; GFX9: v_mov_b32_e32 v1, [[K2]]
+; GFX9: v_mov_b32_e32 v0, 0x20001
+; GFX9: v_mov_b32_e32 v1, 3
; GFX9: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
call void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>)
ret void
}
+; GCN-LABEL: {{^}}test_call_external_void_func_v3f16_imm:
+; GFX9: v_mov_b32_e32 v0, 0x40003c00
+; GFX9: v_mov_b32_e32 v1, 0x4400
+; GFX9: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
+ call void @external_void_func_v3f16(<3 x half> <half 1.0, half 2.0, half 4.0>)
+ ret void
+}
+
; GCN-LABEL: {{^}}test_call_external_void_func_v4i16:
; GFX9: buffer_load_dwordx2 v[0:1]
; GFX9-NOT: v0
Modified: llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll?rev=341801&r1=341800&r2=341801&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll Mon Sep 10 04:49:23 2018
@@ -504,16 +504,15 @@ define amdgpu_kernel void @test_fold_can
; FIXME: Extra 4th component handled
; GCN-LABEL: {{^}}v_test_canonicalize_var_v3f16:
; GFX9: s_waitcnt
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: s_setpc_b64
-; VI-DAG: v_max_f16_sdwa [[CANON_ELT3:v[0-9]+]], v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-DAG: v_max_f16_e32 [[CANON_ELT2:v[0-9]+]], v1, v1
; VI-DAG: v_max_f16_sdwa [[CANON_ELT1:v[0-9]+]], v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[CANON_ELT0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 v1, v1, v1
; VI-DAG: v_or_b32_e32 v0, [[CANON_ELT0]], [[CANON_ELT1]]
-; VI-DAG: v_or_b32_e32 v1, [[CANON_ELT2]], [[CANON_ELT3]]
+
; VI: s_setpc_b64
define <3 x half> @v_test_canonicalize_var_v3f16(<3 x half> %val) #1 {
%canonicalized = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> %val)
Modified: llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.f16.ll?rev=341801&r1=341800&r2=341801&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.f16.ll Mon Sep 10 04:49:23 2018
@@ -153,8 +153,8 @@ define <3 x half> @test_fmax_legacy_ugt_
; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v3f16:
; GFX9-NNAN: ; %bb.0:
; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NNAN-NEXT: v_pk_max_f16 v1, v1, v3
; GFX9-NNAN-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX9-NNAN-NEXT: v_pk_max_f16 v1, v1, v3
; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16:
Modified: llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.f16.ll?rev=341801&r1=341800&r2=341801&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.f16.ll Mon Sep 10 04:49:23 2018
@@ -154,8 +154,8 @@ define <3 x half> @test_fmin_legacy_ule_
; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v3f16:
; GFX9-NNAN: ; %bb.0:
; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NNAN-NEXT: v_pk_min_f16 v1, v1, v3
; GFX9-NNAN-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX9-NNAN-NEXT: v_pk_min_f16 v1, v1, v3
; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: test_fmin_legacy_ule_v3f16:
Modified: llvm/trunk/test/CodeGen/AMDGPU/function-args.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/function-args.ll?rev=341801&r1=341800&r2=341801&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/function-args.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/function-args.ll Mon Sep 10 04:49:23 2018
@@ -314,8 +314,17 @@ define void @void_func_v4i16(<4 x i16> %
}
; GCN-LABEL: {{^}}void_func_v5i16:
-; GCN-DAG: buffer_store_short v4, off,
-; GCN-DAG: buffer_store_dwordx2 v[1:2], off
+; CI: v_lshlrev_b32
+; CI: v_and_b32
+; CI: v_lshlrev_b32
+; CI: v_or_b32
+; CI: v_or_b32
+; CI-DAG: buffer_store_short v
+; CI-DAG: buffer_store_dwordx2 v
+
+; GFX89-DAG: buffer_store_short v2, off,
+; GFX89-DAG: buffer_store_dwordx2 v[0:1], off
+
define void @void_func_v5i16(<5 x i16> %arg0) #0 {
store <5 x i16> %arg0, <5 x i16> addrspace(1)* undef
ret void
Modified: llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll?rev=341801&r1=341800&r2=341801&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll Mon Sep 10 04:49:23 2018
@@ -317,13 +317,13 @@ define <4 x half> @v4f16_func_void() #0
ret <4 x half> %val
}
+; FIXME: Mixing buffer and global
; FIXME: Should not scalarize
; GCN-LABEL: {{^}}v5i16_func_void:
; GFX9: buffer_load_dwordx2 v[0:1]
-; GFX9: buffer_load_ushort v4
-; GFX9: v_lshrrev_b32_e32 v5, 16, v0
-; GFX9: v_lshrrev_b32_e32 v3, 16, v1
-; GCN: s_setpc_b64
+; GFX9-NEXT: global_load_short_d16 v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
define <5 x i16> @v5i16_func_void() #0 {
%ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(4)* undef
%val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr
Modified: llvm/trunk/test/CodeGen/AMDGPU/mad-mix-lo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/mad-mix-lo.ll?rev=341801&r1=341800&r2=341801&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/mad-mix-lo.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/mad-mix-lo.ll Mon Sep 10 04:49:23 2018
@@ -94,12 +94,10 @@ define <2 x half> @v_mad_mix_v2f32(<2 x
; GCN-LABEL: {{^}}v_mad_mix_v3f32:
; GCN: s_waitcnt
-; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
-; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
-; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX9-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64
define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
%src0.ext = fpext <3 x half> %src0 to <3 x float>
@@ -149,11 +147,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp
; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_postcvt:
; GCN: s_waitcnt
; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_setpc_b64
define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
%src0.ext = fpext <3 x half> %src0 to <3 x float>
@@ -246,15 +244,16 @@ define <2 x half> @v_mad_mix_v2f32_clamp
; FIXME: Handling undef 4th component
; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_precvt:
-; GFX9: v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX9: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX9: v_mad_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX9: v_mad_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-
-; GFX9: v_cvt_f16_f32
-; GFX9: v_cvt_f16_f32
-; GFX9: v_cvt_f16_f32
-; GFX9: v_cvt_f16_f32
+; GCN: s_waitcnt
+; GFX9-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v6
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-NEXT: s_setpc_b64
define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
%src0.ext = fpext <3 x half> %src0 to <3 x float>
%src1.ext = fpext <3 x half> %src1 to <3 x float>
More information about the llvm-commits
mailing list