[llvm] d7b4b76 - [AMDGPU] Handle memset users in PromoteAlloca
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 28 06:02:01 PDT 2023
Author: pvanhout
Date: 2023-03-28T15:01:55+02:00
New Revision: d7b4b76956e2da8b251c4e6121624477e1491211
URL: https://github.com/llvm/llvm-project/commit/d7b4b76956e2da8b251c4e6121624477e1491211
DIFF: https://github.com/llvm/llvm-project/commit/d7b4b76956e2da8b251c4e6121624477e1491211.diff
LOG: [AMDGPU] Handle memset users in PromoteAlloca
Allows allocas with memset users to be promoted.
This is intended to prevent patterns such as `memset(&alloca, 0, sizeof(alloca))` (which I believe frontends can emit) from blocking the vectorization of allocas (a source-level sketch of such a pattern follows below).
Fixes SWDEV-388784
Reviewed By: arsenm
Differential Revision: https://reviews.llvm.org/D146225
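A minimal sketch of the kind of source pattern involved (editor's illustration, not from the commit; assumes the frontend lowers the memset call to a single llvm.memset covering the whole alloca):

    // Hypothetical C++ snippet: `tmp` becomes an alloca, and the memset spans
    // its entire store size, so the new isSupportedMemset() check accepts it.
    #include <cstring>
    void scale(const float *in, float *out, int n) {
      float tmp[8];
      std::memset(tmp, 0, sizeof(tmp)); // non-volatile memset of the full alloca
      for (int i = 0; i < n && i < 8; ++i)
        tmp[i] = in[i] * 2.0f;
      for (int i = 0; i < 8; ++i)
        out[i] = tmp[i];
    }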
Added:
llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index a7da4005e867e..2fe5fbebf7c19 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -21,6 +21,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
@@ -384,6 +385,19 @@ struct MemTransferInfo {
ConstantInt *DestIndex = nullptr;
};
+// Checks if the instruction I is a memset user of the alloca AI that we can
+// deal with. Currently, only non-volatile memsets that affect the whole alloca
+// are handled.
+static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
+ const DataLayout &DL) {
+ using namespace PatternMatch;
+ // For now we only care about non-volatile memsets that affect the whole type
+ // (start at index 0 and fill the whole alloca).
+ const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
+ return I->getOperand(0) == AI &&
+ match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
+}
+
static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
unsigned MaxVGPRs) {
@@ -485,6 +499,12 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
continue;
}
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
+ MSI && isSupportedMemset(MSI, Alloca, DL)) {
+ WorkList.push_back(Inst);
+ continue;
+ }
+
if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
if (TransferInst->isVolatile())
return false;
@@ -609,6 +629,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca->getAlign());
Inst->eraseFromParent();
+ } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
+ // Ensure the length parameter of the memset matches the new vector
+ // type's store size. In general, the type size shouldn't change, so this
+ // is a no-op, but it's better to be safe.
+ MSI->setOperand(2, Builder.getInt64(DL.getTypeStoreSize(VectorTy)));
} else {
llvm_unreachable("Unsupported call when promoting alloca to vector");
}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
new file mode 100644
index 0000000000000..f31421de517cb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca,sroa < %s | FileCheck %s
+
+; Checks that memsets don't block PromoteAlloca.
+
+; Note: memsets are just updated with the new type size. They are not eliminated, which means
+; the original allocas also remain. This puts a bit more load on SROA.
+; If PromoteAlloca is moved to SSAUpdater, we could just entirely replace the memsets with
+; e.g. ConstantAggregate.
+
+define amdgpu_kernel void @memset_all_zero(i64 %val) {
+; CHECK-LABEL: @memset_all_zero(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <6 x i64> zeroinitializer, i64 [[VAL:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i64> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %stack = alloca [6 x i64], align 4, addrspace(5)
+ call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 0, i64 48, i1 false)
+ store i64 %val, ptr addrspace(5) %stack
+ %reload = load i64, ptr addrspace(5) %stack
+ %stack.1 = getelementptr [6 x i64], ptr addrspace(5) %stack, i64 0, i64 1
+ store i64 %val, ptr addrspace(5) %stack.1
+ ret void
+}
+
+define amdgpu_kernel void @memset_all_5(i64 %val) {
+; CHECK-LABEL: @memset_all_5(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> <i64 361700864190383365, i64 361700864190383365, i64 361700864190383365, i64 361700864190383365>, i64 [[VAL:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %stack = alloca [4 x i64], align 4, addrspace(5)
+ call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 5, i64 32, i1 false)
+ store i64 %val, ptr addrspace(5) %stack
+ %reload = load i64, ptr addrspace(5) %stack
+ %stack.1 = getelementptr [6 x i64], ptr addrspace(5) %stack, i64 0, i64 1
+ store i64 %val, ptr addrspace(5) %stack.1
+ ret void
+}
+
+define amdgpu_kernel void @memset_volatile_nopromote(i64 %val) {
+; CHECK-LABEL: @memset_volatile_nopromote(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
+; CHECK-NEXT: [[STACK_SROA_2:%.*]] = alloca [3 x i64], align 8, addrspace(5)
+; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
+; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_2]], i8 0, i64 24, i1 true)
+; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %stack = alloca [4 x i64], align 4, addrspace(5)
+ call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 0, i64 32, i1 true)
+ store i64 %val, ptr addrspace(5) %stack
+ ret void
+}
+
+define amdgpu_kernel void @memset_badsize_nopromote(i64 %val) {
+; CHECK-LABEL: @memset_badsize_nopromote(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
+; CHECK-NEXT: [[STACK_SROA_2:%.*]] = alloca [23 x i8], align 4, addrspace(5)
+; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
+; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[STACK_SROA_2]], i8 0, i64 23, i1 true)
+; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %stack = alloca [4 x i64], align 4, addrspace(5)
+ call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 0, i64 31, i1 true)
+ store i64 %val, ptr addrspace(5) %stack
+ ret void
+}
+
+define amdgpu_kernel void @memset_offset_ptr_nopromote(i64 %val) {
+; CHECK-LABEL: @memset_offset_ptr_nopromote(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[STACK_SROA_1:%.*]] = alloca [3 x i64], align 8, addrspace(5)
+; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_1]], i8 0, i64 24, i1 true)
+; CHECK-NEXT: ret void
+;
+entry:
+ %stack = alloca [4 x i64], align 4, addrspace(5)
+ %gep = getelementptr [4 x i64], ptr addrspace(5) %stack, i64 0, i64 1
+ call void @llvm.memset.p5.i64(ptr addrspace(5) %gep, i8 0, i64 24, i1 true)
+ store i64 %val, ptr addrspace(5) %stack
+ ret void
+}
+
+declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg)
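As a quick sanity check on the splat constant in memset_all_5 above (editor's note, not part of the commit): a memset with byte value 5 over i64 elements yields 0x0505050505050505 per element, i.e. 361700864190383365 in decimal, matching the vector constant in the CHECK lines. A self-contained C++ verification:

    // Hypothetical standalone check of the memset_all_5 splat constant.
    #include <cstdint>
    #include <cstdio>
    int main() {
      uint64_t splat = 0;
      for (int i = 0; i < 8; ++i)
        splat = (splat << 8) | 0x05; // replicate the memset byte across an i64
      std::printf("%llu\n", (unsigned long long)splat); // prints 361700864190383365
      return 0;
    }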