[llvm] AMDGPU: Update the description of cache policy for buffer intrinsics, NFC (PR #87364)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 2 09:29:43 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Changpeng Fang (changpeng)
Explicitly added gfx940, which has SC and NT bits. There must be better ways of documenting the cache policy, but this is what I can do for now.
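For context on how this operand is passed, here is a minimal LLVM IR sketch; the constant is illustrative only, following the bit layout in the updated comments:

```llvm
; Hypothetical use of the raw buffer load intrinsic. The final i32 immediate is
; the auxiliary/cachepolicy operand described in this patch. With bit 0 = glc
; and bit 1 = slc, the value 3 requests glc+slc on pre-gfx940 targets; on
; gfx940 the same two bits are instead read as sc0 and nt.
define float @load_glc_slc(<4 x i32> inreg %rsrc, i32 %off) {
  %v = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off, i32 0, i32 3)
  ret float %v
}
declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32 immarg)
```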
Fixes: SWDEV-449810
---
Patch is 46.62 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/87364.diff
1 File Affected:
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+233-285)
``````````diff
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 3de20bb44e0c1b..6bbc13f1de86e2 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -851,16 +851,19 @@ class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
list<SDNodeProperty> sdnodeprops> : Intrinsic<
P_.RetTypes, // vdata(VGPR) -- for load/atomic-with-return
!listconcat(
- !foreach(arg, P_.DataArgs, arg.Type), // vdata(VGPR) -- for store/atomic
- !if(P_.IsAtomic, [], [llvm_i32_ty]), // dmask(imm)
- P_.AddrTypes, // vaddr(VGPR)
- [llvm_v8i32_ty], // rsrc(SGPR)
- !if(P_.IsSample, [llvm_v4i32_ty, // samp(SGPR)
- llvm_i1_ty], []), // unorm(imm)
- [llvm_i32_ty, // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
- llvm_i32_ty]), // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc;
- // gfx12+ imm: bits [0-2] = th, bits [3-4] = scope)
-
+ !foreach(arg, P_.DataArgs, arg.Type), // vdata(VGPR) -- for store/atomic
+ !if(P_.IsAtomic, [], [llvm_i32_ty]), // dmask(imm)
+ P_.AddrTypes, // vaddr(VGPR)
+ [llvm_v8i32_ty], // rsrc(SGPR)
+ !if(P_.IsSample, [llvm_v4i32_ty, // samp(SGPR)
+ llvm_i1_ty], []), // unorm(imm)
+ [llvm_i32_ty, // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
+ llvm_i32_ty]), // auxiliary/cachepolicy(imm):
+ // bit 0 = glc, bit 1 = slc,
+ // bit 2 = dlc (gfx10/gfx11),
+ // bit 4 = scc (gfx90a)
+ // gfx940: bit 0 = sc0, bit 1 = nt, bit 4 = sc1
+ // gfx12+: bits [0-2] = th, bits [3-4] = scope
!listconcat(props, [IntrNoCallback, IntrNoFree, IntrWillReturn],
!if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
!if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
@@ -1085,11 +1088,15 @@ def int_amdgcn_buffer_load : AMDGPUBufferLoad;
// the offset argument is uniform.
def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
[llvm_any_ty],
- [llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // byte offset
- llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc;
- // gfx12+ imm: bits [0-2] = th, bits [3-4] = scope)
- // Note: volatile bit is **not** permitted here.
+ [llvm_v4i32_ty, // rsrc(SGPR)
+ llvm_i32_ty, // byte offset
+ llvm_i32_ty], // auxiliary/cachepolicy(imm):
+ // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
+ // bit 3 = swz, bit 4 = scc (gfx90a)
+ // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+ // gfx12+: bits [0-2] = th, bits [3-4] = scope,
+ // bit 6 = swz
+ // Note: volatile bit is **not** permitted here.
[IntrNoMem, ImmArg<ArgIndex<2>>]>,
AMDGPURsrcIntrinsic<0>;
@@ -1123,19 +1130,16 @@ def int_amdgcn_buffer_store : AMDGPUBufferStore;
// operation is volatile.
class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
[data_ty],
- [llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
- llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
- llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc,
- // bit 1 = slc,
- // bit 2 = dlc on gfx10/gfx11),
- // swizzled buffer (bit 3 = swz),
- // gfx12+:
- // cachepolicy (bits [0-2] = th,
- // bits [3-4] = scope)
- // swizzled buffer (bit 6 = swz),
- // all:
- // volatile op (bit 31, stripped at lowering))
+ [llvm_v4i32_ty, // rsrc(SGPR)
+ llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
+ llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+ llvm_i32_ty], // auxiliary/cachepolicy(imm):
+ // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
+ // bit 3 = swz, bit 4 = scc (gfx90a)
+ // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+ // gfx12+: bits [0-2] = th, bits [3-4] = scope,
+ // bit 6 = swz
+ // all: volatile op (bit 31, stripped at lowering)
[IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
@@ -1143,20 +1147,16 @@ def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
[data_ty],
- [AMDGPUBufferRsrcTy, // rsrc(SGPR)
- llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
- llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
- llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc,
- // bit 1 = slc,
- // bit 2 = dlc on gfx10/gfx11),
- // swizzled buffer (bit 3 = swz),
- // gfx12+:
- // cachepolicy (bits [0-2] = th,
- // bits [3-4] = scope)
- // swizzled buffer (bit 6 = swz),
- // all:
- // volatile op (bit 31, stripped at lowering))
-
+ [AMDGPUBufferRsrcTy, // rsrc(SGPR)
+ llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
+ llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+ llvm_i32_ty], // auxiliary/cachepolicy(imm):
+ // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
+ // bit 3 = swz, bit 4 = scc (gfx90a)
+ // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+ // gfx12+: bits [0-2] = th, bits [3-4] = scope,
+ // bit 6 = swz
+ // all: volatile op (bit 31, stripped at lowering)
[IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<0>;
@@ -1165,20 +1165,17 @@ def int_amdgcn_raw_ptr_buffer_load : AMDGPURawPtrBufferLoad;
class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
[data_ty],
- [llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
- llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
- llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc,
- // bit 1 = slc,
- // bit 2 = dlc on gfx10/gfx11),
- // swizzled buffer (bit 3 = swz),
- // gfx12+:
- // cachepolicy (bits [0-2] = th,
- // bits [3-4] = scope)
- // swizzled buffer (bit 6 = swz),
- // all:
- // volatile op (bit 31, stripped at lowering))
+ [llvm_v4i32_ty, // rsrc(SGPR)
+ llvm_i32_ty, // vindex(VGPR)
+ llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
+ llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+ llvm_i32_ty], // auxiliary/cachepolicy(imm):
+ // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
+ // bit 3 = swz, bit 4 = scc (gfx90a)
+ // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+ // gfx12+: bits [0-2] = th, bits [3-4] = scope,
+ // bit 6 = swz
+ // all: volatile op (bit 31, stripped at lowering)
[IntrReadMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
@@ -1186,20 +1183,17 @@ def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;
class AMDGPUStructPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
[data_ty],
- [AMDGPUBufferRsrcTy, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
- llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
- llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc,
- // bit 1 = slc,
- // bit 2 = dlc on gfx10/gfx11),
- // swizzled buffer (bit 3 = swz),
- // gfx12+:
- // cachepolicy (bits [0-2] = th,
- // bits [3-4] = scope)
- // swizzled buffer (bit 6 = swz),
- // all:
- // volatile op (bit 31, stripped at lowering))
+ [AMDGPUBufferRsrcTy, // rsrc(SGPR)
+ llvm_i32_ty, // vindex(VGPR)
+ llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
+ llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+ llvm_i32_ty], // auxiliary/cachepolicy(imm):
+ // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
+ // bit 3 = swz, bit 4 = scc (gfx90a)
+ // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+ // gfx12+: bits [0-2] = th, bits [3-4] = scope,
+ // bit 6 = swz
+ // all: volatile op (bit 31, stripped at lowering)
[IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<0>;
@@ -1208,20 +1202,17 @@ def int_amdgcn_struct_ptr_buffer_load : AMDGPUStructPtrBufferLoad;
class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
[],
- [data_ty, // vdata(VGPR)
- llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
- llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
- llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc,
- // bit 1 = slc,
- // bit 2 = dlc on gfx10/gfx11),
- // swizzled buffer (bit 3 = swz),
- // gfx12+:
- // cachepolicy (bits [0-2] = th,
- // bits [3-4] = scope)
- // swizzled buffer (bit 6 = swz),
- // all:
- // volatile op (bit 31, stripped at lowering))
+ [data_ty, // vdata(VGPR)
+ llvm_v4i32_ty, // rsrc(SGPR)
+ llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
+ llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+ llvm_i32_ty], // auxiliary/cachepolicy(imm):
+ // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
+ // bit 3 = swz, bit 4 = scc (gfx90a)
+ // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+ // gfx12+: bits [0-2] = th, bits [3-4] = scope,
+ // bit 6 = swz
+ // all: volatile op (bit 31, stripped at lowering)
[IntrWriteMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<1>;
def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>;
@@ -1229,20 +1220,17 @@ def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore;
class AMDGPURawPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
[],
- [data_ty, // vdata(VGPR)
- AMDGPUBufferRsrcTy, // rsrc(SGPR)
- llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
- llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
- llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc,
- // bit 1 = slc,
- // bit 2 = dlc on gfx10/gfx11),
- // swizzled buffer (bit 3 = swz),
- // gfx12+:
- // cachepolicy (bits [0-2] = th,
- // bits [3-4] = scope)
- // swizzled buffer (bit 6 = swz),
- // all:
- // volatile op (bit 31, stripped at lowering))
+ [data_ty, // vdata(VGPR)
+ AMDGPUBufferRsrcTy, // rsrc(SGPR)
+ llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
+ llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+ llvm_i32_ty], // auxiliary/cachepolicy(imm):
+ // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
+ // bit 3 = swz, bit 4 = scc (gfx90a)
+ // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+ // gfx12+: bits [0-2] = th, bits [3-4] = scope,
+ // bit 6 = swz
+ // all: volatile op (bit 31, stripped at lowering)
[IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<1>;
@@ -1251,21 +1239,18 @@ def int_amdgcn_raw_ptr_buffer_store : AMDGPURawPtrBufferStore;
class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
[],
- [data_ty, // vdata(VGPR)
- llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
- llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
- llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc,
- // bit 1 = slc,
- // bit 2 = dlc on gfx10/gfx11),
- // swizzled buffer (bit 3 = swz),
- // gfx12+:
- // cachepolicy (bits [0-2] = th,
- // bits [3-4] = scope)
- // swizzled buffer (bit 6 = swz),
- // all:
- // volatile op (bit 31, stripped at lowering))
+ [data_ty, // vdata(VGPR)
+ llvm_v4i32_ty, // rsrc(SGPR)
+ llvm_i32_ty, // vindex(VGPR)
+ llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
+ llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+ llvm_i32_ty], // auxiliary/cachepolicy(imm):
+ // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
+ // bit 3 = swz, bit 4 = scc (gfx90a)
+ // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+ // gfx12+: bits [0-2] = th, bits [3-4] = scope,
+ // bit 6 = swz
+ // all: volatile op (bit 31, stripped at lowering)
[IntrWriteMem, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
@@ -1273,21 +1258,18 @@ def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;
class AMDGPUStructPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
[],
- [data_ty, // vdata(VGPR)
- AMDGPUBufferRsrcTy, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
- llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
- llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc,
- // bit 1 = slc,
- // bit 2 = dlc on gfx10/gfx11),
- // swizzled buffer (bit 3 = swz),
- // gfx12+:
- // cachepolicy (bits [0-2] = th,
- // bits [3-4] = scope)
- // swizzled buffer (bit 6 = swz),
- // all:
- // volatile op (bit 31, stripped at lowering))
+ [data_ty, // vdata(VGPR)
+ AMDGPUBufferRsrcTy, // rsrc(SGPR)
+ llvm_i32_ty, // vindex(VGPR)
+ llvm_i32_ty, // offset(VGPR/imm, included in bounds ch...
[truncated]
``````````
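To make the gfx12+ reinterpretation concrete, a second illustrative sketch (the specific th and scope values are hypothetical; their meanings are defined by the target ISA):

```llvm
; Hypothetical gfx12 store: in the same i32 operand, bits [0-2] now select th,
; bits [3-4] select scope, and bit 6 requests a swizzled buffer, so e.g.
; th = 1, scope = 1, swz = 1 gives 1 | (1 << 3) | (1 << 6) = 0x49 = 73.
; Bit 31 (volatile) would be stripped at lowering, per the comments above.
define void @store_gfx12(float %data, <4 x i32> inreg %rsrc, i32 %off) {
  call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %off, i32 0, i32 73)
  ret void
}
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg)
```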
https://github.com/llvm/llvm-project/pull/87364