[llvm] 6d5d8b1 - [AMDGPU] gfx11 ldsdir intrinsics and ISel
Joe Nash via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 17 06:32:42 PDT 2022
Author: Joe Nash
Date: 2022-06-17T09:03:16-04:00
New Revision: 6d5d8b131300284d5c26f73017da612db532dc9a
URL: https://github.com/llvm/llvm-project/commit/6d5d8b131300284d5c26f73017da612db532dc9a
DIFF: https://github.com/llvm/llvm-project/commit/6d5d8b131300284d5c26f73017da612db532dc9a.diff
LOG: [AMDGPU] gfx11 ldsdir intrinsics and ISel
Reviewed By: #amdgpu, rampitec
Differential Revision: https://reviews.llvm.org/D127664
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.direct.load.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.param.load.mir
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
llvm/lib/Target/AMDGPU/LDSDIRInstructions.td
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 5b52d70fe6798..c2a6534def950 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1488,6 +1488,23 @@ def int_amdgcn_interp_p2_f16 :
[IntrNoMem, IntrSpeculatable, IntrWillReturn,
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
+// __int_amdgcn_lds_direct_load <m0>
+// The input argument is m0, which contains a packed combination of address
+// offset and flags describing the data type.
+def int_amdgcn_lds_direct_load :
+ Intrinsic<[llvm_any_ty], // overloaded for types u8, u16, i32/f32, i8, i16
+ [llvm_i32_ty],
+ [IntrReadMem, IntrSpeculatable, IntrWillReturn]>;
+
+// __int_amdgcn_lds_param_load <attr_chan>, <attr>, <m0>
+// Like interp intrinsics, this reads from lds, but the memory values are constant,
+// so it behaves like IntrNoMem.
+def int_amdgcn_lds_param_load :
+ Intrinsic<[llvm_float_ty],
+ [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable, IntrWillReturn,
+ ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
+
// Deprecated: use llvm.amdgcn.live.mask instead.
def int_amdgcn_ps_live : Intrinsic <
[llvm_i1_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 02a25a7d5778b..8f046d07ed066 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3008,7 +3008,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case Intrinsic::amdgcn_interp_p2:
case Intrinsic::amdgcn_interp_mov:
case Intrinsic::amdgcn_interp_p1_f16:
- case Intrinsic::amdgcn_interp_p2_f16: {
+ case Intrinsic::amdgcn_interp_p2_f16:
+ case Intrinsic::amdgcn_lds_param_load: {
applyDefaultMapping(OpdMapper);
// Readlane for m0 value, which is always the last operand.
@@ -3120,6 +3121,12 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(MI, MRI, 8); // M0
return;
+ case Intrinsic::amdgcn_lds_direct_load: {
+ applyDefaultMapping(OpdMapper);
+ // Readfirstlane for m0 value, which is always the last operand.
+ constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
+ return;
+ }
default: {
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -4446,7 +4453,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_interp_p2:
case Intrinsic::amdgcn_interp_mov:
case Intrinsic::amdgcn_interp_p1_f16:
- case Intrinsic::amdgcn_interp_p2_f16: {
+ case Intrinsic::amdgcn_interp_p2_f16:
+ case Intrinsic::amdgcn_lds_param_load: {
const int M0Idx = MI.getNumOperands() - 1;
Register M0Reg = MI.getOperand(M0Idx).getReg();
unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
@@ -4678,6 +4686,21 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_lds_direct_load: {
+ const int M0Idx = MI.getNumOperands() - 1;
+ Register M0Reg = MI.getOperand(M0Idx).getReg();
+ unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
+ for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
+ OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+
+ // Must be SGPR, but we must take whatever the original bank is and fix it
+ // later.
+ OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
+ break;
+ }
case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 833c1dcdf018f..8297635d7bb2a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -230,6 +230,8 @@ def : SourceOfDivergence<int_amdgcn_interp_p1>;
def : SourceOfDivergence<int_amdgcn_interp_p2>;
def : SourceOfDivergence<int_amdgcn_interp_p1_f16>;
def : SourceOfDivergence<int_amdgcn_interp_p2_f16>;
+def : SourceOfDivergence<int_amdgcn_lds_direct_load>;
+def : SourceOfDivergence<int_amdgcn_lds_param_load>;
def : SourceOfDivergence<int_amdgcn_mbcnt_hi>;
def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
def : SourceOfDivergence<int_r600_read_tidig_x>;
diff --git a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td
index eabd387caab2a..1f65376890da9 100644
--- a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td
+++ b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td
@@ -91,6 +91,16 @@ class LDSDIR_Real<bits<2> op, LDSDIR_Pseudo lds, int subtarget> :
def LDS_DIRECT_LOAD : LDSDIR_Pseudo<"lds_direct_load", 1>;
def LDS_PARAM_LOAD : LDSDIR_Pseudo<"lds_param_load", 0>;
+def : GCNPat <
+ (f32 (int_amdgcn_lds_direct_load M0)),
+ (LDS_DIRECT_LOAD 0)
+>;
+
+def : GCNPat <
+ (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)),
+ (LDS_PARAM_LOAD timm:$attr, timm:$attrchan, 0)
+>;
+
//===----------------------------------------------------------------------===//
// GFX11+
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.direct.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.direct.load.mir
new file mode 100644
index 0000000000000..5309d8f66fdd5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.direct.load.mir
@@ -0,0 +1,36 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: lds_direct_load_s
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0
+ ; CHECK-LABEL: name: lds_direct_load_s
+ ; CHECK: liveins: $sgpr0
+ ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.lds.direct.load), [[COPY]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.lds.direct.load), %0
+...
+
+---
+name: lds_direct_load_v
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; CHECK-LABEL: name: lds_direct_load_v
+ ; CHECK: liveins: $vgpr0
+ ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.lds.direct.load), [[V_READFIRSTLANE_B32_]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.lds.direct.load), %0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.param.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.param.load.mir
new file mode 100644
index 0000000000000..8d4624b29aa9c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.param.load.mir
@@ -0,0 +1,36 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: lds_param_load_s
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0
+ ; CHECK-LABEL: name: lds_param_load_s
+ ; CHECK: liveins: $sgpr0
+ ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.lds.param.load), 1, 1, [[COPY]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.lds.param.load), 1, 1, %0
+...
+
+---
+name: lds_param_load_v
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; CHECK-LABEL: name: lds_param_load_v
+ ; CHECK: liveins: $vgpr0
+ ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec
+ ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.lds.param.load), 1, 1, [[V_READFIRSTLANE_B32_]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.lds.param.load), 1, 1, %0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll
new file mode 100644
index 0000000000000..0551d2cf7bb89
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+
+; GFX11-LABEL: {{^}}lds_direct_load:
+; GFX11: s_mov_b32 m0
+; GFX11: lds_direct_load v{{[0-9]+}}
+; GFX11: s_mov_b32 m0
+; GFX11: lds_direct_load v{{[0-9]+}}
+; GFX11: s_mov_b32 m0
+; GFX11: lds_direct_load v{{[0-9]+}}
+; GFX11: v_add_f32
+; GFX11: buffer_store_b32
+; GFX11: buffer_store_b32
+; GFX11: buffer_store_b32
+; GFX11: buffer_store_b32
+; GFX11: buffer_store_b32
+; GFX11: buffer_store_b32
+define amdgpu_ps void @lds_direct_load(<4 x i32> inreg %buf, i32 inreg %arg0,
+ i32 inreg %arg1, i32 inreg %arg2) #0 {
+main_body:
+ %p0 = call float @llvm.amdgcn.lds.direct.load(i32 %arg0)
+ ; Ensure memory clustering is occurring for lds_direct_load
+ %p5 = fadd float %p0, 1.0
+ %p1 = call float @llvm.amdgcn.lds.direct.load(i32 %arg1)
+ %p2 = call float @llvm.amdgcn.lds.direct.load(i32 %arg2)
+ %p3 = call float @llvm.amdgcn.lds.direct.load(i32 %arg1)
+ %p4 = call float @llvm.amdgcn.lds.direct.load(i32 %arg2)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %p5, <4 x i32> %buf, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %p1, <4 x i32> %buf, i32 4, i32 1, i32 0)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %p2, <4 x i32> %buf, i32 4, i32 2, i32 0)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %p3, <4 x i32> %buf, i32 4, i32 3, i32 0)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %p4, <4 x i32> %buf, i32 4, i32 4, i32 0)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %p0, <4 x i32> %buf, i32 4, i32 5, i32 0)
+ ret void
+}
+
+declare float @llvm.amdgcn.lds.direct.load(i32) #1
+declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32)
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll
new file mode 100644
index 0000000000000..6361a02e03a96
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+
+; GFX11-LABEL: {{^}}lds_param_load:
+; GFX11: s_mov_b32 m0
+; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr0.x
+; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr0.y
+; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr0.z
+; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr0.w
+; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr1.x
+; GFX11: v_add_f32
+; GFX11: buffer_store_b32
+; GFX11: buffer_store_b32
+; GFX11: buffer_store_b32
+; GFX11: buffer_store_b32
+; GFX11: buffer_store_b32
+; GFX11: buffer_store_b32
+define amdgpu_ps void @lds_param_load(<4 x i32> inreg %buf, i32 inreg %arg) #0 {
+main_body:
+ %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %arg)
+ ; Ensure memory clustering is occurring for lds_param_load
+ %p5 = fadd float %p0, 1.0
+ %p1 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %arg)
+ %p2 = call float @llvm.amdgcn.lds.param.load(i32 2, i32 0, i32 %arg)
+ %p3 = call float @llvm.amdgcn.lds.param.load(i32 3, i32 0, i32 %arg)
+ %p4 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %arg)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %p5, <4 x i32> %buf, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %p1, <4 x i32> %buf, i32 4, i32 1, i32 0)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %p2, <4 x i32> %buf, i32 4, i32 2, i32 0)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %p3, <4 x i32> %buf, i32 4, i32 3, i32 0)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %p4, <4 x i32> %buf, i32 4, i32 4, i32 0)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %p0, <4 x i32> %buf, i32 4, i32 5, i32 0)
+ ret void
+}
+
+declare float @llvm.amdgcn.lds.param.load(i32, i32, i32) #1
+declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32)
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
More information about the llvm-commits
mailing list