[llvm] [AMDGPU] Insert casts in PromoteAlloca. (PR #124547)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 27 06:14:53 PST 2025
https://github.com/jofrn created https://github.com/llvm/llvm-project/pull/124547
There may be casts between values of type <n x ptr0> and <m x ptr1> where m != n, that is, pointer vectors with different element counts whose total bit widths nevertheless match. A single bitcast between such types is invalid, so the appropriate intermediate casts must be inserted. For example, to cast %in = <4 x ptr addrspace(5)> to %out = <2 x ptr addrspace(1)>, one must do:
%ints = ptrtoint <4 x ptr addrspace(5)> %in to <4 x i32>
%i128 = bitcast <4 x i32> %ints to i128
%ivec = bitcast i128 %i128 to <2 x i64>
%out = inttoptr <2 x i64> %ivec to <2 x ptr addrspace(1)>
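As a self-contained illustration, the same chain can be written as a complete function (the function and value names here are illustrative only; the cast sequence itself mirrors the CHECK lines generated for the new tests below):

define <2 x ptr addrspace(1)> @cast_example(<4 x ptr addrspace(5)> %in) {
  ; Pointers cannot be bitcast directly, so convert them to integers first.
  %ints = ptrtoint <4 x ptr addrspace(5)> %in to <4 x i32>
  ; Collapse the vector into one wide integer so the element count can change.
  %i128 = bitcast <4 x i32> %ints to i128
  ; Re-split the wide integer using the destination element count.
  %ivec = bitcast i128 %i128 to <2 x i64>
  ; Convert back to pointers in the destination address space.
  %out = inttoptr <2 x i64> %ivec to <2 x ptr addrspace(1)>
  ret <2 x ptr addrspace(1)> %out
}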
From 37bbba8572d21f2607d5692aa8e1d039c210ea49 Mon Sep 17 00:00:00 2001
From: jofernau <Joe.Fernau at amd.com>
Date: Mon, 27 Jan 2025 06:05:24 -0800
Subject: [PATCH] [AMDGPU] Insert casts in PromoteAlloca.
There may be casts between values of type <n x ptr0> and <m x ptr1>
where m != n, that is, pointer vectors with different element counts
whose total bit widths nevertheless match. A single bitcast between
such types is invalid, so the appropriate intermediate casts must be
inserted. For example, to cast %in = <4 x ptr addrspace(5)> to
%out = <2 x ptr addrspace(1)>, one must do:
%ints = ptrtoint <4 x ptr addrspace(5)> %in to <4 x i32>
%i128 = bitcast <4 x i32> %ints to i128
%ivec = bitcast i128 %i128 to <2 x i64>
%out = inttoptr <2 x i64> %ivec to <2 x ptr addrspace(1)>
---
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 63 ++++++-
.../CodeGen/AMDGPU/promote-alloca-subvecs.ll | 166 ++++++++++++++++++
2 files changed, 222 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index e27ef71c1c0883..0082e8f4856fea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -470,19 +470,68 @@ static Value *promoteAllocaUserToVector(
return Dummy;
};
- const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
- Type *PtrTy) -> Value * {
+  const auto CreateCastBetweenUnequalNumVecElems =
+      [&Builder, DL](Value *Val, Type *ResultTy) -> Value * {
+ // Can already cast between vectors of integers.
+ if (isa<IntegerType>(Val->getType()->getScalarType()) &&
+ isa<IntegerType>(ResultTy->getScalarType()))
+ return Builder.CreateBitOrPointerCast(Val, ResultTy);
+
+    // Insert casts between vectors/scalars with an unequal number of elements.
+ FixedVectorType *ValVTy = dyn_cast<FixedVectorType>(Val->getType());
+ FixedVectorType *ResultVTy = dyn_cast<FixedVectorType>(ResultTy);
+ if (isa<PointerType>(Val->getType()->getScalarType())) {
+ Type *IntTy;
+ if (ValVTy) {
+ Type *IntElemTy = Builder.getIntNTy(
+ DL.getTypeAllocSizeInBits(ValVTy->getScalarType()));
+ IntTy = FixedVectorType::get(IntElemTy, ValVTy->getNumElements());
+      } else {
+        IntTy = Builder.getIntNTy(DL.getTypeAllocSizeInBits(Val->getType()));
+      }
+      // Insert a ptrtoint if casting to a scalar, or to a vector with a
+      // different number of elements.
+      const bool IsToScalar = !ResultVTy;
+      const bool IsToVector =
+          ResultVTy && (!ValVTy || ResultVTy->getNumElements() !=
+                                       ValVTy->getNumElements());
+      if (IsToScalar || IsToVector)
+        Val = Builder.CreatePtrToInt(Val, IntTy);
+ }
+
+    const bool IsScalarToVector = ResultVTy && !ValVTy;
+    const bool IsVectorToVector =
+        ResultVTy && ValVTy &&
+        ValVTy->getNumElements() != ResultVTy->getNumElements();
+ if (IsScalarToVector || IsVectorToVector) {
+    Type *IntTy =
+        Builder.getIntNTy(DL.getTypeAllocSizeInBits(Val->getType()));
+    // Bitcast Val to a single integer iM so that the number of elements
+    // can change.
+    Val = Builder.CreateBitCast(Val, IntTy);
+    // If the result is a pointer vector, bitcast iM to <m x iN> first;
+    // the final cast below then becomes an inttoptr to <m x ptr>.
+ if (isa<PointerType>(ResultVTy->getScalarType())) {
+ FixedVectorType *VectorIntTy =
+ FixedVectorType::get(Builder.getIntNTy(
+ DL.getTypeAllocSizeInBits(ResultVTy->getScalarType())),
+ ResultVTy->getNumElements());
+ Val = Builder.CreateBitCast(Val, VectorIntTy);
+ }
+ }
+ return Builder.CreateBitOrPointerCast(Val, ResultTy);
+ };
+
+  const auto CreateTempPtrIntCast =
+      [&Builder, DL, &CreateCastBetweenUnequalNumVecElems](
+          Value *Val, Type *PtrTy) -> Value * {
assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
- if (!PtrTy->isVectorTy())
- return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
+    if (!PtrTy->isVectorTy())
+      return CreateCastBetweenUnequalNumVecElems(Val, Builder.getIntNTy(Size));
const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
    // If we want to cast, e.g., a <2 x ptr> into a <4 x i32>, we need to
    // first cast the ptr vector to <2 x i64>.
    assert((Size % NumPtrElts == 0) && "Vector size not divisible");
Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
- return Builder.CreateBitOrPointerCast(
- Val, FixedVectorType::get(EltTy, NumPtrElts));
+ FixedVectorType *ResultVTy = FixedVectorType::get(EltTy, NumPtrElts);
+ return CreateCastBetweenUnequalNumVecElems(Val, ResultVTy);
};
Type *VecEltTy = VectorTy->getElementType();
@@ -564,7 +613,7 @@ static Value *promoteAllocaUserToVector(
Val = CreateTempPtrIntCast(Val, AccessTy);
else if (VectorTy->isPtrOrPtrVectorTy())
Val = CreateTempPtrIntCast(Val, VectorTy);
- return Builder.CreateBitOrPointerCast(Val, VectorTy);
+ return CreateCastBetweenUnequalNumVecElems(Val, VectorTy);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
index 7c5410004ed5b7..f93c6db3c2712b 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
@@ -327,6 +327,172 @@ entry:
ret void
}
+define <2 x ptr addrspace(1)> @test_subvector_ptralloca_8(<2 x ptr addrspace(1)> %val) {
+; CHECK-LABEL: define <2 x ptr addrspace(1)> @test_subvector_ptralloca_8
+; CHECK-SAME: (<2 x ptr addrspace(1)> [[VAL:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(1)> [[VAL]] to <2 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr <4 x i32> [[TMP2]] to <4 x ptr addrspace(5)>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x ptr addrspace(5)> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x ptr addrspace(5)> undef, ptr addrspace(5) [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr addrspace(5)> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x ptr addrspace(5)> [[TMP5]], ptr addrspace(5) [[TMP6]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x ptr addrspace(5)> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x ptr addrspace(5)> [[TMP7]], ptr addrspace(5) [[TMP8]], i32 2
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x ptr addrspace(5)> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x ptr addrspace(5)> [[TMP9]], ptr addrspace(5) [[TMP10]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x ptr addrspace(5)> poison, ptr addrspace(5) [[TMP4]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x ptr addrspace(5)> [[TMP12]], ptr addrspace(5) [[TMP6]], i64 1
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x ptr addrspace(5)> [[TMP13]], ptr addrspace(5) [[TMP8]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x ptr addrspace(5)> [[TMP14]], ptr addrspace(5) [[TMP10]], i64 3
+; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint <4 x ptr addrspace(5)> [[TMP15]] to <4 x i32>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i32> [[TMP16]] to i128
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i128 [[TMP17]] to <2 x i64>
+; CHECK-NEXT: [[TMP19:%.*]] = inttoptr <2 x i64> [[TMP18]] to <2 x ptr addrspace(1)>
+; CHECK-NEXT: ret <2 x ptr addrspace(1)> [[TMP19]]
+;
+entry:
+ %stack = alloca [8 x ptr addrspace(5)], align 4, addrspace(5)
+ store <2 x ptr addrspace(1)> %val, ptr addrspace(5) %stack
+ %L = load <2 x ptr addrspace(1)>, ptr addrspace(5) %stack, align 16
+ ret <2 x ptr addrspace(1)> %L
+}
+
+define <2 x ptr addrspace(1)> @test_subvector_ptralloca_4(<2 x ptr addrspace(1)> %val) {
+; CHECK-LABEL: define <2 x ptr addrspace(1)> @test_subvector_ptralloca_4
+; CHECK-SAME: (<2 x ptr addrspace(1)> [[VAL:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(1)> [[VAL]] to <2 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr <4 x i32> [[TMP2]] to <4 x ptr addrspace(5)>
+; CHECK-NEXT: ret <2 x ptr addrspace(1)> [[VAL]]
+;
+entry:
+ %stack = alloca [4 x ptr addrspace(5)], align 4, addrspace(5)
+ store <2 x ptr addrspace(1)> %val, ptr addrspace(5) %stack
+ %L = load <2 x ptr addrspace(1)>, ptr addrspace(5) %stack, align 16
+ ret <2 x ptr addrspace(1)> %L
+}
+
+define <2 x ptr addrspace(1)> @test_vector_ptralloca_2_3to1(<2 x ptr addrspace(1)> %val) {
+; CHECK-LABEL: define <2 x ptr addrspace(1)> @test_vector_ptralloca_2_3to1
+; CHECK-SAME: (<2 x ptr addrspace(1)> [[VAL:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(1)> [[VAL]] to <2 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr <4 x i32> [[TMP2]] to <4 x ptr addrspace(3)>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x ptr addrspace(3)> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x ptr addrspace(3)> undef, ptr addrspace(3) [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr addrspace(3)> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x ptr addrspace(3)> [[TMP5]], ptr addrspace(3) [[TMP6]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x ptr addrspace(3)> poison, ptr addrspace(3) [[TMP4]], i64 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x ptr addrspace(3)> [[TMP8]], ptr addrspace(3) [[TMP6]], i64 1
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint <4 x ptr addrspace(3)> [[TMP9]] to <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to i128
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i128 [[TMP11]] to <2 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr <2 x i64> [[TMP12]] to <2 x ptr addrspace(1)>
+; CHECK-NEXT: ret <2 x ptr addrspace(1)> [[TMP13]]
+;
+entry:
+ %stack = alloca [2 x ptr addrspace(3)], align 4, addrspace(3)
+ store <2 x ptr addrspace(1)> %val, ptr addrspace(3) %stack
+ %L = load <2 x ptr addrspace(1)>, ptr addrspace(3) %stack, align 16
+ ret <2 x ptr addrspace(1)> %L
+}
+
+define <2 x ptr addrspace(5)> @test_subvector_ptralloca_2_1to5(<2 x ptr addrspace(5)> %val) {
+; CHECK-LABEL: define <2 x ptr addrspace(5)> @test_subvector_ptralloca_2_1to5
+; CHECK-SAME: (<2 x ptr addrspace(5)> [[VAL:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(5)> [[VAL]] to <2 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr <1 x i64> [[TMP2]] to <1 x ptr addrspace(1)>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <1 x ptr addrspace(1)> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x ptr addrspace(1)> undef, ptr addrspace(1) [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <1 x ptr addrspace(1)> poison, ptr addrspace(1) [[TMP4]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint <1 x ptr addrspace(1)> [[TMP6]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP8]] to <2 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr <2 x i32> [[TMP9]] to <2 x ptr addrspace(5)>
+; CHECK-NEXT: ret <2 x ptr addrspace(5)> [[TMP10]]
+;
+entry:
+ %stack = alloca [8 x ptr addrspace(1)], align 4, addrspace(1)
+ store <2 x ptr addrspace(5)> %val, ptr addrspace(1) %stack
+ %L = load <2 x ptr addrspace(5)>, ptr addrspace(1) %stack, align 16
+ ret <2 x ptr addrspace(5)> %L
+}
+
+define <2 x ptr addrspace(270)> @test_subvector_ptralloca_8_3to270(<2 x ptr addrspace(270)> %val) {
+; CHECK-LABEL: define <2 x ptr addrspace(270)> @test_subvector_ptralloca_8_3to270
+; CHECK-SAME: (<2 x ptr addrspace(270)> [[VAL:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(270)> [[VAL]] to <2 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr <4 x i32> [[TMP2]] to <4 x ptr addrspace(3)>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x ptr addrspace(3)> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x ptr addrspace(3)> undef, ptr addrspace(3) [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr addrspace(3)> [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x ptr addrspace(3)> [[TMP5]], ptr addrspace(3) [[TMP6]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x ptr addrspace(3)> [[TMP3]], i64 2
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x ptr addrspace(3)> [[TMP7]], ptr addrspace(3) [[TMP8]], i32 2
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x ptr addrspace(3)> [[TMP3]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x ptr addrspace(3)> [[TMP9]], ptr addrspace(3) [[TMP10]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x ptr addrspace(3)> poison, ptr addrspace(3) [[TMP4]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x ptr addrspace(3)> [[TMP12]], ptr addrspace(3) [[TMP6]], i64 1
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x ptr addrspace(3)> [[TMP13]], ptr addrspace(3) [[TMP8]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x ptr addrspace(3)> [[TMP14]], ptr addrspace(3) [[TMP10]], i64 3
+; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint <4 x ptr addrspace(3)> [[TMP15]] to <4 x i32>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i32> [[TMP16]] to i128
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i128 [[TMP17]] to <2 x i64>
+; CHECK-NEXT: [[TMP19:%.*]] = inttoptr <2 x i64> [[TMP18]] to <2 x ptr addrspace(270)>
+; CHECK-NEXT: ret <2 x ptr addrspace(270)> [[TMP19]]
+;
+entry:
+ %stack = alloca [8 x ptr addrspace(3)], align 4, addrspace(3)
+ store <2 x ptr addrspace(270)> %val, ptr addrspace(3) %stack
+ %L = load <2 x ptr addrspace(270)>, ptr addrspace(3) %stack, align 16
+ ret <2 x ptr addrspace(270)> %L
+}
+
+define ptr @test_subvector_ptralloca_2_scalar(ptr %val) {
+; CHECK-LABEL: define ptr @test_subvector_ptralloca_2_scalar
+; CHECK-SAME: (ptr [[VAL:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[VAL]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = inttoptr <2 x i32> [[TMP1]] to <2 x ptr addrspace(3)>
+; CHECK-NEXT: ret ptr [[VAL]]
+;
+entry:
+ %stack = alloca <2 x ptr addrspace(3)>, align 8, addrspace(3)
+ store ptr %val, ptr addrspace(3) %stack
+ %L = load ptr, ptr addrspace(3) %stack, align 8
+ ret ptr %L
+}
+
+define ptr @test_subvector_ptralloca_1_scalar(ptr %val) {
+; CHECK-LABEL: define ptr @test_subvector_ptralloca_1_scalar
+; CHECK-SAME: (ptr [[VAL:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[STACK:%.*]] = alloca <1 x ptr addrspace(3)>, align 8, addrspace(3)
+; CHECK-NEXT: store ptr [[VAL]], ptr addrspace(3) [[STACK]], align 8
+; CHECK-NEXT: [[L:%.*]] = load ptr, ptr addrspace(3) [[STACK]], align 8
+; CHECK-NEXT: ret ptr [[L]]
+;
+entry:
+ %stack = alloca <1 x ptr addrspace(3)>, align 8, addrspace(3)
+ store ptr %val, ptr addrspace(3) %stack
+ %L = load ptr, ptr addrspace(3) %stack, align 8
+ ret ptr %L
+}
+
define void @test_out_of_bounds_subvec(<2 x i64> %val) {
; CHECK-LABEL: define void @test_out_of_bounds_subvec
; CHECK-SAME: (<2 x i64> [[VAL:%.*]]) {