[llvm] [AMDGPU] Split wide integer dpp8 intrinsic calls (PR #113500)
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 28 12:30:47 PDT 2024
https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/113500
>From f3e9035a670986b0ab47bc146520a8dc4168b3eb Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 23 Oct 2024 14:54:52 -0700
Subject: [PATCH 1/2] [AMDGPU] Split wide integer dpp8 intrinsic calls
The int_amdgcn_mov_dpp8 is declared with llvm_anyint_ty, but we can only
select i32. To allow a corresponding builtin to be overloaded the same
way as int_amdgcn_mov_dpp we need it to be able to split unsupported
i64 values.
---
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 35 +++++++++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll | 33 +++++++++++++++++
2 files changed, 68 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index c49aab823b44a4..4e25f8c9464918 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -317,6 +317,7 @@ class AMDGPUCodeGenPrepareImpl
bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
bool visitMinNum(IntrinsicInst &I);
bool visitSqrt(IntrinsicInst &I);
+ bool visitMovDppIntrinsic(IntrinsicInst &I);
bool run(Function &F);
};
@@ -2099,6 +2100,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
return visitMinNum(I);
case Intrinsic::sqrt:
return visitSqrt(I);
+ case Intrinsic::amdgcn_mov_dpp8:
+ return visitMovDppIntrinsic(I);
default:
return false;
}
@@ -2257,6 +2260,38 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
return true;
}
+// Split unsupported wide integer calls.
+bool AMDGPUCodeGenPrepareImpl::visitMovDppIntrinsic(IntrinsicInst &I) {
+ Type *SrcTy = I.getType();
+ assert(SrcTy->isIntegerTy());
+ unsigned Size = SrcTy->getPrimitiveSizeInBits();
+ assert(Size % 32 == 0);
+ if (Size <= 32)
+ return false;
+
+ IRBuilder<> Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+ unsigned NumElt = Size / 32;
+ IntegerType *EltTy = Builder.getInt32Ty();
+ Type *VecTy = VectorType::get(EltTy, NumElt, false);
+ Value *Vec = Builder.CreateBitCast(I.getArgOperand(0), VecTy);
+
+ unsigned IID = I.getIntrinsicID();
+ SmallVector<Value *, 6> Args(I.args());
+ SmallVector<Value *, 4> Elts;
+ for (unsigned N = 0; N != NumElt; ++N) {
+ Args[0] = Builder.CreateExtractElement(Vec, N);
+ Elts.push_back(Builder.CreateIntrinsic(EltTy, IID, Args));
+ }
+
+ Value *DppVec = insertValues(Builder, VecTy, Elts);
+ Value *NewVal = Builder.CreateBitCast(DppVec, SrcTy);
+ NewVal->takeName(&I);
+ I.replaceAllUsesWith(NewVal);
+ I.eraseFromParent();
+ return true;
+}
+
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
Impl.Mod = &M;
Impl.DL = &Impl.Mod->getDataLayout();
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
index 8bff17b7299270..35aac8533aa153 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
@@ -24,6 +24,39 @@ define amdgpu_kernel void @dpp8_wait_states(ptr addrspace(1) %out, i32 %in) {
ret void
}
+; GFX10PLUS-LABEL: {{^}}dpp8_i64:
+; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
+define amdgpu_ps void @dpp8_i64(i64 %in, ptr addrspace(1) %out) {
+ %tmp0 = call i64 @llvm.amdgcn.mov.dpp8.i64(i64 %in, i32 1) #0
+ store i64 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_i128:
+; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: global_store_{{dwordx4|b128}} v[4:5], v[0:3], off
+define amdgpu_ps void @dpp8_i128(i128 %in, ptr addrspace(1) %out) {
+ %tmp0 = call i128 @llvm.amdgcn.mov.dpp8.i128(i128 %in, i32 1) #0
+ store i128 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_i96:
+; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
+define amdgpu_ps void @dpp8_i96(i96 %in, ptr addrspace(1) %out) {
+ %tmp0 = call i96 @llvm.amdgcn.mov.dpp8.i96(i96 %in, i32 1) #0
+ store i96 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0
attributes #0 = { nounwind readnone convergent }
>From c35107b598f8f869df0447631d5ce8b43f660fc6 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Mon, 28 Oct 2024 12:29:58 -0700
Subject: [PATCH 2/2] Exit on sub-dword size before asserting it is divisible
by 32.
---
llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 4e25f8c9464918..2432cac95b5886 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -2265,9 +2265,9 @@ bool AMDGPUCodeGenPrepareImpl::visitMovDppIntrinsic(IntrinsicInst &I) {
Type *SrcTy = I.getType();
assert(SrcTy->isIntegerTy());
unsigned Size = SrcTy->getPrimitiveSizeInBits();
- assert(Size % 32 == 0);
if (Size <= 32)
return false;
+ assert(Size % 32 == 0);
IRBuilder<> Builder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());
More information about the llvm-commits
mailing list