[llvm] [AMDGPU] Override getRegUsageForType() to fix <N x ptr(7)> crash (PR #126642)
Krzysztof Drewniak via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 12 12:04:42 PST 2025
https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/126642
>From 24f04c888ff4f8f073afdf25472c645c747c12a9 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Tue, 11 Feb 2025 01:08:41 +0000
Subject: [PATCH 1/4] Make TargetLowering::getValueType() virtual to fix <N x
ptr(7)> crash
Even though ptr addrspace(7) is rewritten away into more primitive
constructs before reaching SelectionDAG or GlobalISel, we still
sometimes need to query properties like how many registers it will
require.
We already had a workaround that maps ptr
addrspace(7) (and addrspace(9)) to MVT::v{5,6}i32, their ultimate
in-register representations, in overrides of
TargetLowering::getPointerTy(DL, AddressSpace).
However, whenever TargetLowering::getValueType() tried to construct a
vector VT out of those vector MVTs, the vector constructor would
assert because you can't have a vector of vectors.
This commit solves the crash by manually overriding getValueType() and
getMemValueType() in the AMDGPU TargetLowering.
This is something of a big hammer, and I'm open to suggestions for a
more precise change.
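
For illustration, a minimal sketch of the failing query, assuming a
SITargetLowering instance TLI and a module DataLayout DL (these names
are hypothetical, not part of the patch):

  // Sketch only; assumes TLI (the SITargetLowering) and DL exist.
  LLVMContext Ctx;
  Type *P7 = PointerType::get(Ctx, /*AddressSpace=*/7);
  Type *VecTy = FixedVectorType::get(P7, /*NumElts=*/2);
  // getPointerTy(DL, 7) returns MVT::v5i32, so the generic getValueType()
  // would call EVT::getVectorVT(Ctx, v5i32, 2) -- a vector of vectors --
  // and assert before this patch.
  EVT VT = TLI.getValueType(DL, VecTy);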
---
llvm/include/llvm/CodeGen/TargetLowering.h | 9 ++---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 35 +++++++++++++++++
llvm/lib/Target/AMDGPU/SIISelLowering.h | 4 ++
.../AMDGPU/buffer-fat-pointer.ll | 39 +++++++++++++++++++
4 files changed, 82 insertions(+), 5 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index bbecc7a6ddaee..8e9e2edc3e149 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1675,8 +1675,8 @@ class TargetLoweringBase {
/// operations except for the pointer size. If AllowUnknown is true, this
/// will return MVT::Other for types with no EVT counterpart (e.g. structs),
/// otherwise it will assert.
- EVT getValueType(const DataLayout &DL, Type *Ty,
- bool AllowUnknown = false) const {
+ virtual EVT getValueType(const DataLayout &DL, Type *Ty,
+ bool AllowUnknown = false) const {
// Lower scalar pointers to native pointer types.
if (auto *PTy = dyn_cast<PointerType>(Ty))
return getPointerTy(DL, PTy->getAddressSpace());
@@ -1695,8 +1695,8 @@ class TargetLoweringBase {
return EVT::getEVT(Ty, AllowUnknown);
}
- EVT getMemValueType(const DataLayout &DL, Type *Ty,
- bool AllowUnknown = false) const {
+ virtual EVT getMemValueType(const DataLayout &DL, Type *Ty,
+ bool AllowUnknown = false) const {
// Lower scalar pointers to native pointer types.
if (auto *PTy = dyn_cast<PointerType>(Ty))
return getPointerMemTy(DL, PTy->getAddressSpace());
@@ -1714,7 +1714,6 @@ class TargetLoweringBase {
return getValueType(DL, Ty, AllowUnknown);
}
-
/// Return the MVT corresponding to this LLVM type. See getValueType.
MVT getSimpleValueType(const DataLayout &DL, Type *Ty,
bool AllowUnknown = false) const {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b632c50dae0e3..7bb3a9f262419 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1203,6 +1203,41 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
}
+/// Passes like the loop vectorizer will, for example, try to query the size in
+/// registers of a buffer fat pointer. They don't exist by the time we reach
+/// codegen, but these queries can still come in. Unfortunately, something like
+/// <2 x ptr addrspace(7)> will get lowered to <2 x v5i32> by the workarounds
+/// above, which causes a crash. Handle this case here.
+EVT SITargetLowering::getValueType(const DataLayout &DL, Type *Ty,
+                                   bool AllowUnknown) const {
+  if (auto *VT = dyn_cast<VectorType>(Ty)) {
+    if (auto *PT = dyn_cast<PointerType>(VT->getElementType())) {
+      MVT MET = getPointerTy(DL, PT->getAddressSpace());
+      if (MET.isVector() && MET.getVectorElementType() == MVT::i32) {
+        return EVT::getVectorVT(
+            Ty->getContext(), EVT(MET.getVectorElementType()),
+            VT->getElementCount() * MET.getVectorNumElements());
+      }
+    }
+  }
+  return AMDGPUTargetLowering::getValueType(DL, Ty, AllowUnknown);
+}
+
+EVT SITargetLowering::getMemValueType(const DataLayout &DL, Type *Ty,
+                                      bool AllowUnknown) const {
+  if (auto *VT = dyn_cast<VectorType>(Ty)) {
+    if (auto *PT = dyn_cast<PointerType>(VT->getElementType())) {
+      MVT ScalarTy = getPointerMemTy(DL, PT->getAddressSpace());
+      if (ScalarTy.isVector() && ScalarTy.getVectorElementType() == MVT::i32) {
+        return EVT::getVectorVT(
+            Ty->getContext(), EVT(ScalarTy.getVectorElementType()),
+            VT->getElementCount() * ScalarTy.getVectorNumElements());
+      }
+    }
+  }
+  return AMDGPUTargetLowering::getMemValueType(DL, Ty, AllowUnknown);
+}
+
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 1cd7f1b29e077..f355e031c5f89 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -306,6 +306,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
// so, to work around the lack of i160, map it to v5i32.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override;
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override;
+ EVT getValueType(const DataLayout &DL, Type *Ty,
+ bool AllowUnknown = false) const override;
+ EVT getMemValueType(const DataLayout &DL, Type *Ty,
+ bool AllowUnknown = false) const override;
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
MachineFunction &MF,
diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll
new file mode 100644
index 0000000000000..b7cf8db453dcf
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=loop-vectorize -S < %s | FileCheck %s
+
+; Reduced from a crash, variables added to make things more realistic.
+; This is a roundabout test for TargetLowering::getValueType() returning
+; a reasonable value for <N x p7> instead of asserting.
+define amdgpu_kernel void @_dynamic_pack_simple_dispatch_0_pack_i32(ptr addrspace(1) %.ptr, i64 %0) {
+; CHECK-LABEL: define amdgpu_kernel void @_dynamic_pack_simple_dispatch_0_pack_i32(
+; CHECK-SAME: ptr addrspace(1) [[DOTPTR:%.*]], i64 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[_LR_PH5:.*:]]
+; CHECK-NEXT: [[DOTRSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[DOTPTR]], i16 0, i32 -2147483648, i32 159744)
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(8) [[DOTRSRC]] to ptr addrspace(7)
+; CHECK-NEXT: br label %[[BB2:.*]]
+; CHECK: [[BB2]]:
+; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, [[DOTLR_PH5:%.*]] ], [ [[TMP5:%.*]], %[[BB2]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5]] = add i64 [[TMP3]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP3]], [[TMP0]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], [[DOT_CRIT_EDGE_LOOPEXIT:label %.*]], label %[[BB2]]
+; CHECK: [[__CRIT_EDGE_LOOPEXIT:.*:]]
+; CHECK-NEXT: ret void
+;
+.lr.ph5:
+ %.rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %.ptr, i16 0, i32 2147483648, i32 159744)
+ %1 = addrspacecast ptr addrspace(8) %.rsrc to ptr addrspace(7)
+ br label %2
+
+2: ; preds = %2, %.lr.ph5
+ %3 = phi i64 [ 0, %.lr.ph5 ], [ %5, %2 ]
+ %4 = getelementptr i32, ptr addrspace(7) %1, i32 0
+ %5 = add i64 %3, 1
+ %exitcond.not = icmp eq i64 %3, %0
+ br i1 %exitcond.not, label %._crit_edge.loopexit, label %2
+
+._crit_edge.loopexit: ; preds = %2
+ ret void
+}
+
+declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i32, i32)
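
With the overrides above in place, the same query flattens the type
instead of asserting. A hypothetical check of the expected result (not
part of the patch; TLI, DL, and Ctx as in the earlier sketch):

  // <2 x ptr addrspace(7)> -> 2 lanes x v5i32 = v10i32 (sketch only).
  EVT VT = TLI.getValueType(
      DL, FixedVectorType::get(PointerType::get(Ctx, 7), 2));
  assert(VT == MVT::v10i32 && "fat pointer vectors flatten to dwords");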
>From dbb5c4db60b70323e09af5725d009253df486076 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Tue, 11 Feb 2025 21:19:45 +0000
Subject: [PATCH 2/4] Rework PR per Matt
---
llvm/include/llvm/CodeGen/TargetLowering.h | 32 ++++++++++++-----
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 35 -------------------
llvm/lib/Target/AMDGPU/SIISelLowering.h | 4 ---
.../AMDGPU/buffer-fat-pointer.ll | 24 ++++++-------
4 files changed, 36 insertions(+), 59 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8e9e2edc3e149..b9ebc60496c82 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1675,40 +1675,56 @@ class TargetLoweringBase {
/// operations except for the pointer size. If AllowUnknown is true, this
/// will return MVT::Other for types with no EVT counterpart (e.g. structs),
/// otherwise it will assert.
- virtual EVT getValueType(const DataLayout &DL, Type *Ty,
- bool AllowUnknown = false) const {
+ EVT getValueType(const DataLayout &DL, Type *Ty,
+ bool AllowUnknown = false) const {
// Lower scalar pointers to native pointer types.
if (auto *PTy = dyn_cast<PointerType>(Ty))
return getPointerTy(DL, PTy->getAddressSpace());
if (auto *VTy = dyn_cast<VectorType>(Ty)) {
Type *EltTy = VTy->getElementType();
+ ElementCount EC = VTy->getElementCount();
// Lower vectors of pointers to native pointer types.
if (auto *PTy = dyn_cast<PointerType>(EltTy)) {
EVT PointerTy(getPointerTy(DL, PTy->getAddressSpace()));
+ // Kludge around AMDGPU's fat pointers which, while not lowered to
+ // codegen, still needed an MVT, and could only use vectors because
+ // there weren't big enough scalars. Therefore, flatten the nominal
+ // vector-of-vectors.
+ if (PointerTy.isVector()) {
+ EC = EC * PointerTy.getVectorNumElements();
+ PointerTy = PointerTy.getVectorElementType();
+ }
EltTy = PointerTy.getTypeForEVT(Ty->getContext());
}
- return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(EltTy, false),
- VTy->getElementCount());
+ return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(EltTy, false), EC);
}
return EVT::getEVT(Ty, AllowUnknown);
}
- virtual EVT getMemValueType(const DataLayout &DL, Type *Ty,
- bool AllowUnknown = false) const {
+ EVT getMemValueType(const DataLayout &DL, Type *Ty,
+ bool AllowUnknown = false) const {
// Lower scalar pointers to native pointer types.
if (auto *PTy = dyn_cast<PointerType>(Ty))
return getPointerMemTy(DL, PTy->getAddressSpace());
if (auto *VTy = dyn_cast<VectorType>(Ty)) {
Type *EltTy = VTy->getElementType();
+ ElementCount EC = VTy->getElementCount();
if (auto *PTy = dyn_cast<PointerType>(EltTy)) {
EVT PointerTy(getPointerMemTy(DL, PTy->getAddressSpace()));
+ // Kludge around AMDGPU's fat pointers which, while not lowered to
+ // codegen, still needed an MVT, and could only use vectors because
+ // there weren't big enough scalars. Therefore, flatten the nominal
+ // vector-of-vectors.
+ if (PointerTy.isVector()) {
+ EC = EC * PointerTy.getVectorNumElements();
+ PointerTy = PointerTy.getVectorElementType();
+ }
EltTy = PointerTy.getTypeForEVT(Ty->getContext());
}
- return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(EltTy, false),
- VTy->getElementCount());
+ return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(EltTy, false), EC);
}
return getValueType(DL, Ty, AllowUnknown);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7bb3a9f262419..b632c50dae0e3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1203,41 +1203,6 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
}
-/// Passes like the loop vectorizer will, for example, try to query the size in
-/// registers of a buffer fat pointer. They don't exist by the time we reach
-/// codegen, but these queries can still come in. Unfortunately, something like
-/// <2 x ptr addrspace(7)> will get lowered to <2 x v5i32> by the workarounds
-/// above, which causes a crash. Handle this case here.
-EVT SITargetLowering::getValueType(const DataLayout &DL, Type *Ty,
-                                   bool AllowUnknown) const {
-  if (auto *VT = dyn_cast<VectorType>(Ty)) {
-    if (auto *PT = dyn_cast<PointerType>(VT->getElementType())) {
-      MVT MET = getPointerTy(DL, PT->getAddressSpace());
-      if (MET.isVector() && MET.getVectorElementType() == MVT::i32) {
-        return EVT::getVectorVT(
-            Ty->getContext(), EVT(MET.getVectorElementType()),
-            VT->getElementCount() * MET.getVectorNumElements());
-      }
-    }
-  }
-  return AMDGPUTargetLowering::getValueType(DL, Ty, AllowUnknown);
-}
-
-EVT SITargetLowering::getMemValueType(const DataLayout &DL, Type *Ty,
-                                      bool AllowUnknown) const {
-  if (auto *VT = dyn_cast<VectorType>(Ty)) {
-    if (auto *PT = dyn_cast<PointerType>(VT->getElementType())) {
-      MVT ScalarTy = getPointerMemTy(DL, PT->getAddressSpace());
-      if (ScalarTy.isVector() && ScalarTy.getVectorElementType() == MVT::i32) {
-        return EVT::getVectorVT(
-            Ty->getContext(), EVT(ScalarTy.getVectorElementType()),
-            VT->getElementCount() * ScalarTy.getVectorNumElements());
-      }
-    }
-  }
-  return AMDGPUTargetLowering::getMemValueType(DL, Ty, AllowUnknown);
-}
-
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index f355e031c5f89..1cd7f1b29e077 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -306,10 +306,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
// so, to work around the lack of i160, map it to v5i32.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override;
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override;
- EVT getValueType(const DataLayout &DL, Type *Ty,
- bool AllowUnknown = false) const override;
- EVT getMemValueType(const DataLayout &DL, Type *Ty,
- bool AllowUnknown = false) const override;
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
MachineFunction &MF,
diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll
index b7cf8db453dcf..3abbe13483e03 100644
--- a/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll
+++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll
@@ -4,7 +4,7 @@
; Reduced from a crash, variables added to make things more realistic.
; This is a roundabout test for TargetLowering::getValueType() returning
; a reasonable value for <N x p7> instead of asserting.
-define amdgpu_kernel void @_dynamic_pack_simple_dispatch_0_pack_i32(ptr addrspace(1) %.ptr, i64 %0) {
+define amdgpu_kernel void @_dynamic_pack_simple_dispatch_0_pack_i32(ptr addrspace(1) %.ptr, i64 %v) {
; CHECK-LABEL: define amdgpu_kernel void @_dynamic_pack_simple_dispatch_0_pack_i32(
; CHECK-SAME: ptr addrspace(1) [[DOTPTR:%.*]], i64 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[_LR_PH5:.*:]]
@@ -20,19 +20,19 @@ define amdgpu_kernel void @_dynamic_pack_simple_dispatch_0_pack_i32(ptr addrspac
; CHECK: [[__CRIT_EDGE_LOOPEXIT:.*:]]
; CHECK-NEXT: ret void
;
-.lr.ph5:
- %.rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %.ptr, i16 0, i32 2147483648, i32 159744)
- %1 = addrspacecast ptr addrspace(8) %.rsrc to ptr addrspace(7)
- br label %2
+entry:
+ %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %.ptr, i16 0, i32 2147483648, i32 159744)
+ %fat = addrspacecast ptr addrspace(8) %rsrc to ptr addrspace(7)
+ br label %loop
-2: ; preds = %2, %.lr.ph5
- %3 = phi i64 [ 0, %.lr.ph5 ], [ %5, %2 ]
- %4 = getelementptr i32, ptr addrspace(7) %1, i32 0
- %5 = add i64 %3, 1
- %exitcond.not = icmp eq i64 %3, %0
- br i1 %exitcond.not, label %._crit_edge.loopexit, label %2
+loop: ; preds = %loop, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %ptr = getelementptr i32, ptr addrspace(7) %fat, i32 0
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv, %v
+ br i1 %exitcond.not, label %exit, label %loop
-._crit_edge.loopexit: ; preds = %2
+exit: ; preds = %loop
ret void
}
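
For reference, the arithmetic the reworked generic code performs,
extracted as a standalone sketch (Ctx assumed; getPointerTy(DL, 7) is
MVT::v5i32):

  // Flattening <2 x ptr addrspace(7)>: 2 lanes x 5 dwords = v10i32.
  ElementCount EC = ElementCount::getFixed(2);  // N = 2 lanes
  EVT PointerTy(MVT::v5i32);                    // per-lane representation
  EC = EC * PointerTy.getVectorNumElements();   // 2 * 5 = 10
  EVT Flat = EVT::getVectorVT(Ctx, PointerTy.getVectorElementType(), EC);
  // Flat is v10i32: the same 320 bits, with no vector-of-vectors.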
>From 7303934715b539dec745a24617c91149b92dd6de Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Wed, 12 Feb 2025 19:31:21 +0000
Subject: [PATCH 3/4] Undo version 2 of the change
---
llvm/include/llvm/CodeGen/TargetLowering.h | 25 +++++-----------------
1 file changed, 5 insertions(+), 20 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index b9ebc60496c82..bbecc7a6ddaee 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1683,21 +1683,13 @@ class TargetLoweringBase {
if (auto *VTy = dyn_cast<VectorType>(Ty)) {
Type *EltTy = VTy->getElementType();
- ElementCount EC = VTy->getElementCount();
// Lower vectors of pointers to native pointer types.
if (auto *PTy = dyn_cast<PointerType>(EltTy)) {
EVT PointerTy(getPointerTy(DL, PTy->getAddressSpace()));
- // Kludge around AMDGPU's fat pointers which, while not lowered to
- // codegen, still needed an MVT, and could only use vectors because
- // there weren't big enough scalars. Therefore, flatten the nominal
- // vector-of-vectors.
- if (PointerTy.isVector()) {
- EC = EC * PointerTy.getVectorNumElements();
- PointerTy = PointerTy.getVectorElementType();
- }
EltTy = PointerTy.getTypeForEVT(Ty->getContext());
}
- return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(EltTy, false), EC);
+ return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(EltTy, false),
+ VTy->getElementCount());
}
return EVT::getEVT(Ty, AllowUnknown);
@@ -1711,25 +1703,18 @@ class TargetLoweringBase {
if (auto *VTy = dyn_cast<VectorType>(Ty)) {
Type *EltTy = VTy->getElementType();
- ElementCount EC = VTy->getElementCount();
if (auto *PTy = dyn_cast<PointerType>(EltTy)) {
EVT PointerTy(getPointerMemTy(DL, PTy->getAddressSpace()));
- // Kludge around AMDGPU's fat pointers which, while not lowered to
- // codegen, still needed an MVT, and could only use vectors because
- // there weren't big enough scalars. Therefore, flatten the nominal
- // vector-of-vectors.
- if (PointerTy.isVector()) {
- EC = EC * PointerTy.getVectorNumElements();
- PointerTy = PointerTy.getVectorElementType();
- }
EltTy = PointerTy.getTypeForEVT(Ty->getContext());
}
- return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(EltTy, false), EC);
+ return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(EltTy, false),
+ VTy->getElementCount());
}
return getValueType(DL, Ty, AllowUnknown);
}
+
/// Return the MVT corresponding to this LLVM type. See getValueType.
MVT getSimpleValueType(const DataLayout &DL, Type *Ty,
bool AllowUnknown = false) const {
>From fa727bc3bbe89e1c1e1c5b7248baa909d57cda31 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Wed, 12 Feb 2025 20:04:29 +0000
Subject: [PATCH 4/4] Rework the patch, v3
---
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 18 ++++++++++++++++++
.../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 8 ++++++++
2 files changed, 26 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 09f7877b13b3a..f37f8d991160d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -313,6 +313,24 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
return !F || !ST->isSingleLaneExecution(*F);
}
+unsigned GCNTTIImpl::getRegUsageForType(Type *Ty) {
+ if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
+ if (auto *PT = dyn_cast<PointerType>(VT->getElementType())) {
+ switch (PT->getAddressSpace()) {
+      // Assume that the resource parts of the pointers in the vector being
+      // asked about are all the same.
+ case AMDGPUAS::BUFFER_FAT_POINTER:
+ return 4 + VT->getNumElements();
+ case AMDGPUAS::BUFFER_STRIDED_POINTER:
+ return 4 + 2 * VT->getNumElements();
+ default:
+ break;
+ }
+ }
+ }
+ return BaseT::getRegUsageForType(Ty);
+}
+
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
// NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
// registers. See getRegisterClassForType for the implementation.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index a0d62008d9ddc..a8fce0a78e565 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -113,6 +113,14 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP);
+ // Vectorization will query for the number of registers needed for
+ // <N x ptr addrspace(7/9)> and the default implementation will cause crashes,
+ // so override it here. This also lets us account for the fact that, in the
+ // context of loop vectorization (which is what uses this API), the number of
+ // registers needed for fat pointers is lower because they'll share a resource
+ // part.
+ unsigned getRegUsageForType(Type *Ty);
+
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
return TTI::PSK_FastHardware;
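
As a worked example of the final accounting (illustrative; TTI and Ctx
are assumed): the resource part of a buffer fat pointer is 128 bits
(4 dwords) and is assumed shared across lanes, while the 32-bit offset
(and, for strided pointers, the 32-bit index) is per-lane:

  // Hypothetical usage sketch, not part of the PR.
  Type *Fat = FixedVectorType::get(PointerType::get(Ctx, 7), 4);
  Type *Strided = FixedVectorType::get(PointerType::get(Ctx, 9), 4);
  unsigned FatRegs = TTI.getRegUsageForType(Fat);         // 4 + 4 = 8
  unsigned StridedRegs = TTI.getRegUsageForType(Strided); // 4 + 2*4 = 12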