[llvm] 02419ab - [AMDGPU] Lower llvm.amdgcn.s.buffer.load.v3[i|f]32
Piotr Sobczak via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 15 06:02:37 PST 2019
Author: Piotr Sobczak
Date: 2019-11-15T15:01:15+01:00
New Revision: 02419ab5c73935bed7aef5fc43e06c6b5f37fc04
URL: https://github.com/llvm/llvm-project/commit/02419ab5c73935bed7aef5fc43e06c6b5f37fc04
DIFF: https://github.com/llvm/llvm-project/commit/02419ab5c73935bed7aef5fc43e06c6b5f37fc04.diff
LOG: [AMDGPU] Lower llvm.amdgcn.s.buffer.load.v3[i|f]32
Summary: Add lowering support for the 32-bit vec3 variant of the s.buffer.load intrinsic.
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D70118
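
The approach, visible in the SIISelLowering.cpp hunk below, is to widen a vec3 s.buffer.load to vec4 and then keep the low three elements via EXTRACT_SUBVECTOR. A condensed sketch of that pattern, simplified from the diff (Ops and the memory operand MMO are set up earlier in lowerSBuffer, exactly as in the patch):

    // Sketch of the widening pattern: load <4 x i32> instead of <3 x i32>,
    // then extract elements 0..2. See the actual hunk in lowerSBuffer below.
    EVT WidenedVT =
        EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
    SDValue Wide = DAG.getMemIntrinsicNode(
        AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
        MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Wide,
                       DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));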
Added:
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e26f0e3b611d..85af397228e7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5659,11 +5659,16 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
SDValue Offset, SDValue GLC, SDValue DLC,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
+
+ const DataLayout &DataLayout = DAG.getDataLayout();
+ unsigned Align =
+ DataLayout.getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo(),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
- VT.getStoreSize(), VT.getStoreSize());
+ VT.getStoreSize(), Align);
if (!Offset->isDivergent()) {
SDValue Ops[] = {
@@ -5672,6 +5677,20 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
GLC,
DLC,
};
+
+ // Widen vec3 load to vec4.
+ if (VT.isVector() && VT.getVectorNumElements() == 3) {
+ EVT WidenedVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
+ auto WidenedOp = DAG.getMemIntrinsicNode(
+ AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
+ MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
+ auto Subvector = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
+ DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
+ return Subvector;
+ }
+
return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
DAG.getVTList(VT), Ops, VT, MMO);
}
@@ -5683,11 +5702,10 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
MVT LoadVT = VT.getSimpleVT();
unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
assert((LoadVT.getScalarType() == MVT::i32 ||
- LoadVT.getScalarType() == MVT::f32) &&
- isPowerOf2_32(NumElts));
+ LoadVT.getScalarType() == MVT::f32));
if (NumElts == 8 || NumElts == 16) {
- NumLoads = NumElts == 16 ? 4 : 2;
+ NumLoads = NumElts / 4;
LoadVT = MVT::v4i32;
}
@@ -5711,8 +5729,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
for (unsigned i = 0; i < NumLoads; ++i) {
Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
- Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
- Ops, LoadVT, MMO));
+ Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
+ LoadVT, MMO, DAG));
}
if (VT == MVT::v8i32 || VT == MVT::v16i32)
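
A note on the retuned immediates in the test diff below: tahiti (SI) encodes the SMRD immediate offset in dwords, whereas tonga (VI) SMEM instructions take a byte offset, so the same 64-byte intrinsic offset is checked as 0x10 on SI and 0x40 on VI. A minimal sketch of that conversion, using a hypothetical IsDwordEncoded flag rather than any helper from this patch:

    // Illustrative only: map the byte offset passed to the intrinsic to the
    // immediate that appears in the disassembly. The real encoding is handled
    // by the AMDGPU SMRD/SMEM instruction definitions, not by this function.
    static unsigned printedSBufferImm(unsigned ByteOffset, bool IsDwordEncoded) {
      // SI-era SMRD: immediate counts dwords; VI-era SMEM: immediate is in bytes.
      return IsDwordEncoded ? ByteOffset / 4 : ByteOffset;
    }

For example, the 200-byte offset in s_buffer_loadx4_imm is checked as 0xc8 on VI and as 200/4 = 50 = 0x32 on SI.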
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
index 1c450e7c0b9f..4c25ebb617b5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -1,8 +1,10 @@
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI
-;CHECK-LABEL: {{^}}s_buffer_load_imm:
-;CHECK-NOT: s_waitcnt;
-;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x4
+;GCN-LABEL: {{^}}s_buffer_load_imm:
+;GCN-NOT: s_waitcnt;
+;SI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x1
+;VI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x4
define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) {
main_body:
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0)
@@ -11,9 +13,9 @@ main_body:
ret void
}
-;CHECK-LABEL: {{^}}s_buffer_load_index:
-;CHECK-NOT: s_waitcnt;
-;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+;GCN-LABEL: {{^}}s_buffer_load_index:
+;GCN-NOT: s_waitcnt;
+;GCN: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
define amdgpu_ps void @s_buffer_load_index(<4 x i32> inreg %desc, i32 inreg %index) {
main_body:
%load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0)
@@ -22,9 +24,21 @@ main_body:
ret void
}
-;CHECK-LABEL: {{^}}s_buffer_loadx2_imm:
-;CHECK-NOT: s_waitcnt;
-;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x40
+;GCN-LABEL: {{^}}s_buffer_load_index_divergent:
+;GCN-NOT: s_waitcnt;
+;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+define amdgpu_ps void @s_buffer_load_index_divergent(<4 x i32> inreg %desc, i32 %index) {
+main_body:
+ %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0)
+ %bitcast = bitcast i32 %load to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
+ ret void
+}
+
+;GCN-LABEL: {{^}}s_buffer_loadx2_imm:
+;GCN-NOT: s_waitcnt;
+;SI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x10
+;VI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x40
define amdgpu_ps void @s_buffer_loadx2_imm(<4 x i32> inreg %desc) {
main_body:
%load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 64, i32 0)
@@ -35,9 +49,9 @@ main_body:
ret void
}
-;CHECK-LABEL: {{^}}s_buffer_loadx2_index:
-;CHECK-NOT: s_waitcnt;
-;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+;GCN-LABEL: {{^}}s_buffer_loadx2_index:
+;GCN-NOT: s_waitcnt;
+;GCN: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
define amdgpu_ps void @s_buffer_loadx2_index(<4 x i32> inreg %desc, i32 inreg %index) {
main_body:
%load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0)
@@ -48,9 +62,67 @@ main_body:
ret void
}
-;CHECK-LABEL: {{^}}s_buffer_loadx4_imm:
-;CHECK-NOT: s_waitcnt;
-;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0xc8
+;GCN-LABEL: {{^}}s_buffer_loadx2_index_divergent:
+;GCN-NOT: s_waitcnt;
+;GCN: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+define amdgpu_ps void @s_buffer_loadx2_index_divergent(<4 x i32> inreg %desc, i32 %index) {
+main_body:
+ %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0)
+ %bitcast = bitcast <2 x i32> %load to <2 x float>
+ %x = extractelement <2 x float> %bitcast, i32 0
+ %y = extractelement <2 x float> %bitcast, i32 1
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
+ ret void
+}
+
+;GCN-LABEL: {{^}}s_buffer_loadx3_imm:
+;GCN-NOT: s_waitcnt;
+;SI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x10
+;VI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x40
+define amdgpu_ps void @s_buffer_loadx3_imm(<4 x i32> inreg %desc) {
+main_body:
+ %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %desc, i32 64, i32 0)
+ %bitcast = bitcast <3 x i32> %load to <3 x float>
+ %x = extractelement <3 x float> %bitcast, i32 0
+ %y = extractelement <3 x float> %bitcast, i32 1
+ %z = extractelement <3 x float> %bitcast, i32 2
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float undef, i1 true, i1 true)
+ ret void
+}
+
+;GCN-LABEL: {{^}}s_buffer_loadx3_index:
+;GCN-NOT: s_waitcnt;
+;GCN: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+define amdgpu_ps void @s_buffer_loadx3_index(<4 x i32> inreg %desc, i32 inreg %index) {
+main_body:
+ %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %desc, i32 %index, i32 0)
+ %bitcast = bitcast <3 x i32> %load to <3 x float>
+ %x = extractelement <3 x float> %bitcast, i32 0
+ %y = extractelement <3 x float> %bitcast, i32 1
+ %z = extractelement <3 x float> %bitcast, i32 2
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float undef, i1 true, i1 true)
+ ret void
+}
+
+;GCN-LABEL: {{^}}s_buffer_loadx3_index_divergent:
+;GCN-NOT: s_waitcnt;
+;SI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+;VI: buffer_load_dwordx3 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+define amdgpu_ps void @s_buffer_loadx3_index_divergent(<4 x i32> inreg %desc, i32 %index) {
+main_body:
+ %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %desc, i32 %index, i32 0)
+ %bitcast = bitcast <3 x i32> %load to <3 x float>
+ %x = extractelement <3 x float> %bitcast, i32 0
+ %y = extractelement <3 x float> %bitcast, i32 1
+ %z = extractelement <3 x float> %bitcast, i32 2
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float undef, i1 true, i1 true)
+ ret void
+}
+
+;GCN-LABEL: {{^}}s_buffer_loadx4_imm:
+;GCN-NOT: s_waitcnt;
+;SI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x32
+;VI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0xc8
define amdgpu_ps void @s_buffer_loadx4_imm(<4 x i32> inreg %desc) {
main_body:
%load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 200, i32 0)
@@ -63,9 +135,9 @@ main_body:
ret void
}
-;CHECK-LABEL: {{^}}s_buffer_loadx4_index:
-;CHECK-NOT: s_waitcnt;
-;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+;GCN-LABEL: {{^}}s_buffer_loadx4_index:
+;GCN-NOT: s_waitcnt;
+;GCN: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
define amdgpu_ps void @s_buffer_loadx4_index(<4 x i32> inreg %desc, i32 inreg %index) {
main_body:
%load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0)
@@ -78,9 +150,25 @@ main_body:
ret void
}
-;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex2:
-;CHECK-NOT: s_waitcnt;
-;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x4
+;GCN-LABEL: {{^}}s_buffer_loadx4_index_divergent:
+;GCN-NOT: s_waitcnt;
+;GCN: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+define amdgpu_ps void @s_buffer_loadx4_index_divergent(<4 x i32> inreg %desc, i32 %index) {
+main_body:
+ %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0)
+ %bitcast = bitcast <4 x i32> %load to <4 x float>
+ %x = extractelement <4 x float> %bitcast, i32 0
+ %y = extractelement <4 x float> %bitcast, i32 1
+ %z = extractelement <4 x float> %bitcast, i32 2
+ %w = extractelement <4 x float> %bitcast, i32 3
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
+ ret void
+}
+
+;GCN-LABEL: {{^}}s_buffer_load_imm_mergex2:
+;GCN-NOT: s_waitcnt;
+;SI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x1
+;VI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x4
define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) {
main_body:
%load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0)
@@ -91,9 +179,10 @@ main_body:
ret void
}
-;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex4:
-;CHECK-NOT: s_waitcnt;
-;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x8
+;GCN-LABEL: {{^}}s_buffer_load_imm_mergex4:
+;GCN-NOT: s_waitcnt;
+;SI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x2
+;VI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x8
define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) {
main_body:
%load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0)
@@ -108,10 +197,10 @@ main_body:
ret void
}
-;CHECK-LABEL: {{^}}s_buffer_load_index_across_bb:
-;CHECK-NOT: s_waitcnt;
-;CHECK: v_or_b32
-;CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+;GCN-LABEL: {{^}}s_buffer_load_index_across_bb:
+;GCN-NOT: s_waitcnt;
+;GCN: v_or_b32
+;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32 %index) {
main_body:
%tmp = shl i32 %index, 4
@@ -125,12 +214,12 @@ bb1: ; preds = %main_body
ret void
}
-;CHECK-LABEL: {{^}}s_buffer_load_index_across_bb_merged:
-;CHECK-NOT: s_waitcnt;
-;CHECK: v_or_b32
-;CHECK: v_or_b32
-;CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
-;CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+;GCN-LABEL: {{^}}s_buffer_load_index_across_bb_merged:
+;GCN-NOT: s_waitcnt;
+;GCN: v_or_b32
+;GCN: v_or_b32
+;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
define amdgpu_ps void @s_buffer_load_index_across_bb_merged(<4 x i32> inreg %desc, i32 %index) {
main_body:
%tmp = shl i32 %index, 4
@@ -150,4 +239,5 @@ bb1: ; preds = %main_body
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
+declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32>, i32, i32)
declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)