[llvm] e94125f - [DSE] Add support for not aligned begin/end
Evgeniy Brevnov via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 3 21:24:37 PST 2021
Author: Evgeniy Brevnov
Date: 2021-03-04T12:24:23+07:00
New Revision: e94125f054310ea4a6559f9b2fa3f1896c1ba48a
URL: https://github.com/llvm/llvm-project/commit/e94125f054310ea4a6559f9b2fa3f1896c1ba48a
DIFF: https://github.com/llvm/llvm-project/commit/e94125f054310ea4a6559f9b2fa3f1896c1ba48a.diff
LOG: [DSE] Add support for not aligned begin/end
This is an attempt to improve handling of partial overlaps in the case of an unaligned begin/end.
The existing implementation just bails out if it encounters such cases. Even when it doesn't, I believe the existing code that checks alignment constraints is not quite correct: it tries to ensure alignment of the "later" start/end offset, while it should instead be preserving the relative alignment between the earlier and later start/end.
The idea behind the change is simple: when the start/end is not aligned as we wish, instead of bailing out, adjust it as necessary to get the desired alignment.
I'll update with performance results as measured by the test-suite; it's still running.
Reviewed By: fhahn
Differential Revision: https://reviews.llvm.org/D93530
Added:
Modified:
llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll
llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll
llvm/test/Transforms/DeadStoreElimination/MemDepAnalysis/OverwriteStoreBegin.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 04467f4306e7..2eeae7f1fb08 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1067,58 +1067,94 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
return MadeChange;
}
-static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
- uint64_t &EarlierSize, int64_t LaterOffset,
+static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierStart,
+ uint64_t &EarlierSize, int64_t LaterStart,
uint64_t LaterSize, bool IsOverwriteEnd) {
- // TODO: base this on the target vector size so that if the earlier
- // store was too small to get vector writes anyway then its likely
- // a good idea to shorten it
- // Power of 2 vector writes are probably always a bad idea to optimize
- // as any store/memset/memcpy is likely using vector instructions so
- // shortening it to not vector size is likely to be slower
auto *EarlierIntrinsic = cast<AnyMemIntrinsic>(EarlierWrite);
- unsigned EarlierWriteAlign = EarlierIntrinsic->getDestAlignment();
- if (!IsOverwriteEnd)
- LaterOffset = int64_t(LaterOffset + LaterSize);
-
- if (!(isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) &&
- !((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0))
- return false;
+ Align PrefAlign = EarlierIntrinsic->getDestAlign().valueOrOne();
+
+ // We assume that memet/memcpy operates in chunks of the "largest" native
+ // type size and aligned on the same value. That means optimal start and size
+ // of memset/memcpy should be modulo of preferred alignment of that type. That
+ // is it there is no any sense in trying to reduce store size any further
+ // since any "extra" stores comes for free anyway.
+ // On the other hand, maximum alignment we can achieve is limited by alignment
+ // of initial store.
+
+ // TODO: Limit maximum alignment by preferred (or abi?) alignment of the
+ // "largest" native type.
+ // Note: What is the proper way to get that value?
+ // Should TargetTransformInfo::getRegisterBitWidth be used or anything else?
+ // PrefAlign = std::min(DL.getPrefTypeAlign(LargestType), PrefAlign);
+
+ int64_t ToRemoveStart = 0;
+ uint64_t ToRemoveSize = 0;
+ // Compute start and size of the region to remove. Make sure 'PrefAlign' is
+ // maintained on the remaining store.
+ if (IsOverwriteEnd) {
+ // Calculate required adjustment for 'LaterStart'in order to keep remaining
+ // store size aligned on 'PerfAlign'.
+ uint64_t Off =
+ offsetToAlignment(uint64_t(LaterStart - EarlierStart), PrefAlign);
+ ToRemoveStart = LaterStart + Off;
+ if (EarlierSize <= uint64_t(ToRemoveStart - EarlierStart))
+ return false;
+ ToRemoveSize = EarlierSize - uint64_t(ToRemoveStart - EarlierStart);
+ } else {
+ ToRemoveStart = EarlierStart;
+ assert(LaterSize >= uint64_t(EarlierStart - LaterStart) &&
+ "Not overlapping accesses?");
+ ToRemoveSize = LaterSize - uint64_t(EarlierStart - LaterStart);
+ // Calculate required adjustment for 'ToRemoveSize'in order to keep
+ // start of the remaining store aligned on 'PerfAlign'.
+ uint64_t Off = offsetToAlignment(ToRemoveSize, PrefAlign);
+ if (Off != 0) {
+ if (ToRemoveSize <= (PrefAlign.value() - Off))
+ return false;
+ ToRemoveSize -= PrefAlign.value() - Off;
+ }
+ assert(isAligned(PrefAlign, ToRemoveSize) &&
+ "Should preserve selected alignment");
+ }
- int64_t NewLength = IsOverwriteEnd
- ? LaterOffset - EarlierOffset
- : EarlierSize - (LaterOffset - EarlierOffset);
+ assert(ToRemoveSize > 0 && "Shouldn't reach here if nothing to remove");
+ assert(EarlierSize > ToRemoveSize && "Can't remove more than original size");
+ uint64_t NewSize = EarlierSize - ToRemoveSize;
if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) {
// When shortening an atomic memory intrinsic, the newly shortened
// length must remain an integer multiple of the element size.
const uint32_t ElementSize = AMI->getElementSizeInBytes();
- if (0 != NewLength % ElementSize)
+ if (0 != NewSize % ElementSize)
return false;
}
LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
<< (IsOverwriteEnd ? "END" : "BEGIN") << ": "
- << *EarlierWrite << "\n KILLER (offset " << LaterOffset
- << ", " << EarlierSize << ")\n");
+ << *EarlierWrite << "\n KILLER [" << ToRemoveStart << ", "
+ << int64_t(ToRemoveStart + ToRemoveSize) << ")\n");
Value *EarlierWriteLength = EarlierIntrinsic->getLength();
Value *TrimmedLength =
- ConstantInt::get(EarlierWriteLength->getType(), NewLength);
+ ConstantInt::get(EarlierWriteLength->getType(), NewSize);
EarlierIntrinsic->setLength(TrimmedLength);
+ EarlierIntrinsic->setDestAlignment(PrefAlign);
- EarlierSize = NewLength;
if (!IsOverwriteEnd) {
- int64_t OffsetMoved = (LaterOffset - EarlierOffset);
Value *Indices[1] = {
- ConstantInt::get(EarlierWriteLength->getType(), OffsetMoved)};
+ ConstantInt::get(EarlierWriteLength->getType(), ToRemoveSize)};
GetElementPtrInst *NewDestGEP = GetElementPtrInst::CreateInBounds(
EarlierIntrinsic->getRawDest()->getType()->getPointerElementType(),
EarlierIntrinsic->getRawDest(), Indices, "", EarlierWrite);
NewDestGEP->setDebugLoc(EarlierIntrinsic->getDebugLoc());
EarlierIntrinsic->setDest(NewDestGEP);
- EarlierOffset = EarlierOffset + OffsetMoved;
}
+
+ // Finally update start and size of earlier access.
+ if (!IsOverwriteEnd)
+ EarlierStart += ToRemoveSize;
+ EarlierSize = NewSize;
+
return true;
}
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll
index b5d9c40cbdbc..2e53a5693b29 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll
@@ -234,12 +234,13 @@ entry:
ret void
}
-define void @dontwrite2to9(i32* nocapture %p) {
-; CHECK-LABEL: @dontwrite2to9(
+define void @write2to10(i32* nocapture %p) {
+; CHECK-LABEL: @write2to10(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i1 false)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 28, i1 false)
; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i16*
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[P4]], i64 1
; CHECK-NEXT: [[P5:%.*]] = bitcast i16* [[ARRAYIDX2]] to i64*
@@ -257,12 +258,13 @@ entry:
ret void
}
-define void @dontwrite2to9_atomic(i32* nocapture %p) {
-; CHECK-LABEL: @dontwrite2to9_atomic(
+define void @write2to10_atomic(i32* nocapture %p) {
+; CHECK-LABEL: @write2to10_atomic(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i32 4)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 28, i32 4)
; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i16*
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[P4]], i64 1
; CHECK-NEXT: [[P5:%.*]] = bitcast i16* [[ARRAYIDX2]] to i64*
@@ -391,3 +393,55 @@ entry:
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture, i8, i64, i32) nounwind
+define void @ow_begin_align1(i8* nocapture %p) {
+; CHECK-LABEL: @ow_begin_align1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 7
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP0]], i8 0, i64 25, i1 false)
+; CHECK-NEXT: [[P2:%.*]] = bitcast i8* [[P]] to i64*
+; CHECK-NEXT: store i64 1, i64* [[P2]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr inbounds i8, i8* %p, i64 1
+ call void @llvm.memset.p0i8.i64(i8* align 1 %p1, i8 0, i64 32, i1 false)
+ %p2 = bitcast i8* %p to i64*
+ store i64 1, i64* %p2, align 1
+ ret void
+}
+
+define void @ow_end_align4(i8* nocapture %p) {
+; CHECK-LABEL: @ow_end_align4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 4
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 28, i1 false)
+; CHECK-NEXT: [[P2:%.*]] = bitcast i8* [[P]] to i64*
+; CHECK-NEXT: store i64 1, i64* [[P2]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr inbounds i8, i8* %p, i64 1
+ call void @llvm.memset.p0i8.i64(i8* align 4 %p1, i8 0, i64 32, i1 false)
+ %p2 = bitcast i8* %p to i64*
+ store i64 1, i64* %p2, align 1
+ ret void
+}
+
+define void @ow_end_align8(i8* nocapture %p) {
+; CHECK-LABEL: @ow_end_align8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[P1]], i8 0, i64 32, i1 false)
+; CHECK-NEXT: [[P2:%.*]] = bitcast i8* [[P]] to i64*
+; CHECK-NEXT: store i64 1, i64* [[P2]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr inbounds i8, i8* %p, i64 1
+ call void @llvm.memset.p0i8.i64(i8* align 8 %p1, i8 0, i64 32, i1 false)
+ %p2 = bitcast i8* %p to i64*
+ store i64 1, i64* %p2, align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll
index b6ae657d17e5..c3f377e085a2 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll
@@ -388,3 +388,61 @@ entry:
store i64 3, i64* %base64_3, align 8
ret void
}
+
+define void @ow_end_align1(i8* nocapture %p) {
+; CHECK-LABEL: @ow_end_align1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[P1]], i8 0, i64 27, i1 false)
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 27
+; CHECK-NEXT: [[P2_I64:%.*]] = bitcast i8* [[P2]] to i64*
+; CHECK-NEXT: store i64 1, i64* [[P2_I64]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr inbounds i8, i8* %p, i64 1
+ call void @llvm.memset.p0i8.i64(i8* align 1 %p1, i8 0, i64 32, i1 false)
+ %p2 = getelementptr inbounds i8, i8* %p1, i64 27
+ %p2.i64 = bitcast i8* %p2 to i64*
+ store i64 1, i64* %p2.i64, align 1
+ ret void
+}
+
+define void @ow_end_align4(i8* nocapture %p) {
+; CHECK-LABEL: @ow_end_align4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P1]], i8 0, i64 28, i1 false)
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 27
+; CHECK-NEXT: [[P2_I64:%.*]] = bitcast i8* [[P2]] to i64*
+; CHECK-NEXT: store i64 1, i64* [[P2_I64]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr inbounds i8, i8* %p, i64 1
+ call void @llvm.memset.p0i8.i64(i8* align 4 %p1, i8 0, i64 32, i1 false)
+ %p2 = getelementptr inbounds i8, i8* %p1, i64 27
+ %p2.i64 = bitcast i8* %p2 to i64*
+ store i64 1, i64* %p2.i64, align 1
+ ret void
+}
+
+define void @ow_end_align8(i8* nocapture %p) {
+; CHECK-LABEL: @ow_end_align8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[P1]], i8 0, i64 32, i1 false)
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 27
+; CHECK-NEXT: [[P2_I64:%.*]] = bitcast i8* [[P2]] to i64*
+; CHECK-NEXT: store i64 1, i64* [[P2_I64]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr inbounds i8, i8* %p, i64 1
+ call void @llvm.memset.p0i8.i64(i8* align 8 %p1, i8 0, i64 32, i1 false)
+ %p2 = getelementptr inbounds i8, i8* %p1, i64 27
+ %p2.i64 = bitcast i8* %p2 to i64*
+ store i64 1, i64* %p2.i64, align 1
+ ret void
+}
+
diff --git a/llvm/test/Transforms/DeadStoreElimination/MemDepAnalysis/OverwriteStoreBegin.ll b/llvm/test/Transforms/DeadStoreElimination/MemDepAnalysis/OverwriteStoreBegin.ll
index 56334be18291..93279b8eb37e 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MemDepAnalysis/OverwriteStoreBegin.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MemDepAnalysis/OverwriteStoreBegin.ll
@@ -234,12 +234,13 @@ entry:
ret void
}
-define void @dontwrite2to9(i32* nocapture %p) {
-; CHECK-LABEL: @dontwrite2to9(
+define void @write2to10(i32* nocapture %p) {
+; CHECK-LABEL: @write2to10(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i1 false)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 28, i1 false)
; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i16*
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[P4]], i64 1
; CHECK-NEXT: [[P5:%.*]] = bitcast i16* [[ARRAYIDX2]] to i64*
@@ -257,12 +258,13 @@ entry:
ret void
}
-define void @dontwrite2to9_atomic(i32* nocapture %p) {
-; CHECK-LABEL: @dontwrite2to9_atomic(
+define void @write2to10_atomic(i32* nocapture %p) {
+; CHECK-LABEL: @write2to10_atomic(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i32 4)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 28, i32 4)
; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i16*
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[P4]], i64 1
; CHECK-NEXT: [[P5:%.*]] = bitcast i16* [[ARRAYIDX2]] to i64*
More information about the llvm-commits
mailing list