[llvm] [AMDGPU] Add custom MachineValueType entries for buffer fat poiners (PR #127692)

Tue Feb 18 12:14:23 PST 2025

https://github.com/krzysz00 created https://github.com/llvm/llvm-project/pull/127692

The old hack of returning v5/v6i32 for the fat and strided buffer pointers was causing issuse during vectorization queries that expected to be able to construct a VectorType from the return value of `MVT getPointerType()`. On example is in the test attached to this PR, which used to crash.

Now, we define the custom MVT entries, the 160-bit amdgpuBufferFatPointer and 192-bit amdgpuBufferStridedPointer, which are used to represent ptr addrspace(7) and ptr addrspace(9) respectively.

Neither of thse types will de present at the time of lowering to a SelectionDAG or other MIR - MVT::amdgpuBufferFatPointer is eliminated by the LowerBufferFatPointers pass and amdgpu::bufferStridedPointer is not currently used outside of the SPIR-V translator (which does its own lowering).

>From 70cc44e5aa4963d3a1a1d1d4fa38a85470a27863 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Tue, 18 Feb 2025 20:09:49 +0000
Subject: [PATCH] [AMDGPU] Add custom MachineValueType entries for buffer fat
 poiners

The old hack of returning v5/v6i32 for the fat and strided buffer
pointers was causing issuse during vectorization queries that expected
to be able to construct a VectorType from the return value of
`MVT getPointerType()`. On example is in the test attached to this PR,
which used to crash.

Now, we define the custom MVT entries, the 160-bit
amdgpuBufferFatPointer and 192-bit amdgpuBufferStridedPointer, which
are used to represent ptr addrspace(7) and ptr addrspace(9)
respectively.

Neither of thse types will de present at the time of lowering to a
SelectionDAG or other MIR - MVT::amdgpuBufferFatPointer is eliminated
by the LowerBufferFatPointers pass and amdgpu::bufferStridedPointer is
not currently used outside of the SPIR-V translator (which does its
own lowering).
---
 llvm/include/llvm/CodeGen/ValueTypes.td       |  4 ++
 llvm/lib/CodeGen/ValueTypes.cpp               |  6 +++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 19 ++++-----
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |  6 ++-
 .../AMDGPU/buffer-fat-pointer.ll              | 39 +++++++++++++++++++
 5 files changed, 63 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll

diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index 6d6b92958b432..c427aa95cf1cd 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -331,6 +331,10 @@ def i64x8     : ValueType<512,  231>;  // 8 Consecutive GPRs (AArch64)
 def aarch64svcount
               : ValueType<16,  232>;  // AArch64 predicate-as-counter
 def spirvbuiltin : ValueType<0, 233>; // SPIR-V's builtin type
+// AMDGPU buffer fat pointer, buffer rsrc + offset, rewritten before MIR translation
+def amdgpuBufferFatPointer : ValueType<160, 234>;
+// AMDGPU buffer strided pointer, buffer rsrc + index + offset, doesn't reach MIR
+def amdgpuBufferStridedPointer : ValueType<192, 235>;
 
 let isNormalValueType = false in {
 def token      : ValueType<0, 504>;  // TokenTy
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index 2c80eee7c904c..0554b6387c5e6 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -194,6 +194,10 @@ std::string EVT::getEVTString() const {
     return "aarch64svcount";
   case MVT::spirvbuiltin:
     return "spirvbuiltin";
+  case MVT::amdgpuBufferFatPointer:
+    return "amdgpuBufferFatPointer";
+  case MVT::amdgpuBufferStridedPointer:
+    return "amdgpuBufferStridedPointer";
   }
 }
 
@@ -219,6 +223,8 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
     return TargetExtType::get(Context, "aarch64.svcount");
   case MVT::x86amx:  return Type::getX86_AMXTy(Context);
   case MVT::i64x8:   return IntegerType::get(Context, 512);
+  case MVT::amdgpuBufferFatPointer:  return IntegerType::get(Context, 160);
+  case MVT::amdgpuBufferStridedPointer:  return IntegerType::get(Context, 192);
   case MVT::externref: return Type::getWasm_ExternrefTy(Context);
   case MVT::funcref: return Type::getWasm_FuncrefTy(Context);
   case MVT::Metadata: return Type::getMetadataTy(Context);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e09b310d107ac..6bee33b1c5168 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1175,19 +1175,20 @@ static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
   return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
 }
 
-/// Map address space 7 to MVT::v5i32 because that's its in-memory
-/// representation. This return value is vector-typed because there is no
-/// MVT::i160 and it is not clear if one can be added. While this could
-/// cause issues during codegen, these address space 7 pointers will be
-/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
-/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
-/// modeling, to work.
+/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
+/// in-memory representation. This return value is a custom type because there
+/// is no MVT::i160 and adding one breaks integer promotion logic. While this
+/// could cause issues during codegen, these address space 7 pointers will be
+/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
+/// in order to allow pre-codegen passes that query TargetTransformInfo, often
+/// for cost modeling, to work. (This also sets us up decently for doing the
+/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
 MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
   if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
-    return MVT::v5i32;
+    return MVT::amdgpuBufferFatPointer;
   if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
       DL.getPointerSizeInBits(AS) == 192)
-    return MVT::v6i32;
+    return MVT::amdgpuBufferStridedPointer;
   return AMDGPUTargetLowering::getPointerTy(DL, AS);
 }
 /// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 1cd7f1b29e077..fedea14af97a5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -302,8 +302,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const override;
 
   // While address space 7 should never make it to codegen, it still needs to
-  // have a MVT to prevent some analyses that query this function from breaking,
-  // so, to work around the lack of i160, map it to v5i32.
+  // have a MVT to prevent some analyses that query this function from breaking.
+  // We use the custum MVT::amdgpuBufferFatPointer and
+  // amdgpu::amdgpuBufferStridedPointer for this, though we use v8i32 for the
+  // momyr type (which is probably unused).
   MVT getPointerTy(const DataLayout &DL, unsigned AS) const override;
   MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override;
 
diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll
new file mode 100644
index 0000000000000..3abbe13483e03
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=loop-vectorize -S < %s | FileCheck %s
+
+; Reduced from a crash, variables added to make things more realistic.
+; This is a roundabout test for TargetLowering::getValueType() returning
+; a reasonable value for <N x p7> instead of asserting.
+define amdgpu_kernel void @_dynamic_pack_simple_dispatch_0_pack_i32(ptr addrspace(1) %.ptr, i64 %v) {
+; CHECK-LABEL: define amdgpu_kernel void @_dynamic_pack_simple_dispatch_0_pack_i32(
+; CHECK-SAME: ptr addrspace(1) [[DOTPTR:%.*]], i64 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[_LR_PH5:.*:]]
+; CHECK-NEXT:    [[DOTRSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[DOTPTR]], i16 0, i32 -2147483648, i32 159744)
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(8) [[DOTRSRC]] to ptr addrspace(7)
+; CHECK-NEXT:    br label %[[BB2:.*]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i64 [ 0, [[DOTLR_PH5:%.*]] ], [ [[TMP5:%.*]], %[[BB2]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP5]] = add i64 [[TMP3]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP3]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], [[DOT_CRIT_EDGE_LOOPEXIT:label %.*]], label %[[BB2]]
+; CHECK:       [[__CRIT_EDGE_LOOPEXIT:.*:]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %.ptr, i16 0, i32 2147483648, i32 159744)
+  %fat = addrspacecast ptr addrspace(8) %rsrc to ptr addrspace(7)
+  br label %loop
+
+loop:                                                ; preds = %loop, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %ptr = getelementptr i32, ptr addrspace(7) %fat, i32 0
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv, %v
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:                             ; preds = %exit
+  ret void
+}
+
+declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i32, i32)