[llvm] 9e65929 - [DSE] Re-enable calloc transformation with extra care (PR25892)

Dawid Jurczak via llvm-commits llvm-commits at lists.llvm.org
Sun Oct 10 12:52:34 PDT 2021


Author: Dawid Jurczak
Date: 2021-10-10T21:47:14+02:00
New Revision: 9e65929a8e2c9d1d6fd9cd4cefaab58420387b2f

URL: https://github.com/llvm/llvm-project/commit/9e65929a8e2c9d1d6fd9cd4cefaab58420387b2f
DIFF: https://github.com/llvm/llvm-project/commit/9e65929a8e2c9d1d6fd9cd4cefaab58420387b2f.diff

LOG: [DSE] Re-enable calloc transformation with extra care (PR25892)

Transformation from malloc+memset to calloc is always correct and in many situations
it brings significant observable benefits in terms of execution speed and memory consumption [1][2].
Unfortunately there are cases when producing calloc cause performance drops [3].
As discussed here: https://reviews.llvm.org/D103009 it's possible to differentiate between those 2 scenarios.
If optimizer is able to prove that after malloc call it's _very_ likely to reach memset branch then after
calloc emission we shouldn't observe any performance hits. Therefore finding "null pointer check" pattern
before memset basic block sounds like good justification for performing transformation.
Also that method was already suggested by GCC folks [4]. Main reason for change is that for now
to be safe we check for post dominance relation which is way too conservative approach making transformation
"almost" disabled in practice. This patch tends to enable transformation again but with extra care.

[1] https://stackoverflow.com/questions/2688466/why-mallocmemset-is-slower-than-calloc
[2] https://vorpus.org/blog/why-does-calloc-exist/
[3] http://smalldatum.blogspot.com/2017/11/a-new-optimization-in-gcc-5x-and-mysql.html
[4] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83022

Differential Revision: https://reviews.llvm.org/D110021

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
    llvm/test/Transforms/DeadStoreElimination/noop-stores.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 769ca03e01c3b..543aca113e9c2 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1844,8 +1844,29 @@ struct DSEState {
           if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) ||
               Func != LibFunc_malloc)
             return false;
+
+          auto shouldCreateCalloc = [](CallInst *Malloc, CallInst *Memset) {
+            // Check for br(icmp ptr, null), truebb, falsebb) pattern at the end
+            // of malloc block
+            auto *MallocBB = Malloc->getParent(),
+                 *MemsetBB = Memset->getParent();
+            if (MallocBB == MemsetBB)
+              return true;
+            auto *Ptr = Memset->getArgOperand(0);
+            auto *TI = MallocBB->getTerminator();
+            ICmpInst::Predicate Pred;
+            BasicBlock *TrueBB, *FalseBB;
+            if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB,
+                                FalseBB)))
+              return false;
+            if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB)
+              return false;
+            return true;
+          };
+
           if (Malloc->getOperand(0) == MemSet->getLength()) {
-            if (DT.dominates(Malloc, MemSet) && PDT.dominates(MemSet, Malloc) &&
+            if (shouldCreateCalloc(Malloc, MemSet) &&
+                DT.dominates(Malloc, MemSet) &&
                 memoryIsNotModifiedBetween(Malloc, MemSet, BatchAA, DL, &DT)) {
               IRBuilder<> IRB(Malloc);
               const auto &DL = Malloc->getModule()->getDataLayout();

diff  --git a/llvm/test/Transforms/DeadStoreElimination/noop-stores.ll b/llvm/test/Transforms/DeadStoreElimination/noop-stores.ll
index 38072c159bfe3..7a8d09b049d26 100644
--- a/llvm/test/Transforms/DeadStoreElimination/noop-stores.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/noop-stores.ll
@@ -405,17 +405,14 @@ if.end:                                           ; preds = %if.then, %entry
   ret i8* %call
 }
 
-; FIXME: malloc+memset are not currently transformed into calloc unless the
-; memset post-dominates the malloc.
 define float* @pr25892(i64 %size) {
 ; CHECK-LABEL: @pr25892(
 ; CHECK:       entry:
-; CHECK-NEXT:    [[CALL:%.*]] = call i8* @malloc(i64 [[SIZE:%.*]])
+; CHECK-NEXT:    [[CALL:%.*]] = call i8* @calloc(i64 1, i64 [[SIZE:%.*]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8* [[CALL]], null
 ; CHECK-NEXT:    br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[BC:%.*]] = bitcast i8* [[CALL]] to float*
-; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* %call, i8 0, i64 %size, i1 false)
 ; CHECK-NEXT:    br label [[CLEANUP]]
 ; CHECK:       cleanup:
 ; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi float* [ [[BC]], [[IF_END]] ], [ null, [[ENTRY:%.*]] ]
@@ -437,13 +434,11 @@ cleanup:
 define float* @pr25892_with_extra_store(i64 %size) {
 ; CHECK-LABEL: @pr25892_with_extra_store(
 ; CHECK:       entry:
-; CHECK-NEXT:    [[CALL:%.*]] = call i8* @malloc(i64 [[SIZE:%.*]])
+; CHECK-NEXT:    [[CALL:%.*]] = call i8* @calloc(i64 1, i64 [[SIZE:%.*]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8* [[CALL]], null
 ; CHECK-NEXT:    br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[BC:%.*]] = bitcast i8* [[CALL]] to float*
-; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* %call, i8 0, i64 %size, i1 false)
-; CHECK-NEXT:    store i8 0, i8* %call, align 1
 ; CHECK-NEXT:    br label [[CLEANUP]]
 ; CHECK:       cleanup:
 ; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi float* [ [[BC]], [[IF_END]] ], [ null, [[ENTRY:%.*]] ]
@@ -463,6 +458,32 @@ cleanup:
   ret float* %retval.0
 }
 
+; This should not create a calloc
+define i8* @malloc_with_no_nointer_null_check(i64 %0, i32 %1) {
+; CHECK-LABEL: @malloc_with_no_nointer_null_check
+; CHECK:       entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i8* @malloc(i64 [[TMP0:%.*]])
+; CHECK-NEXT:    [[A:%.*]] = and i32 [[TMP1:%.*]], 32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[A]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[CALL]], i8 0, i64 [[TMP0]], i1 false)
+; CHECK-NEXT:    br label [[CLEANUP]]
+; CHECK:       cleanup:
+; CHECK-NEXT:    ret i8* [[CALL]]
+;
+entry:
+  %call = call i8* @malloc(i64 %0) inaccessiblememonly
+  %a = and i32 %1, 32
+  %cmp = icmp eq i32 %a, 0
+  br i1 %cmp, label %cleanup, label %if.end
+if.end:
+  call void @llvm.memset.p0i8.i64(i8* %call, i8 0, i64 %0, i1 false)
+  br label %cleanup
+cleanup:
+  ret i8* %call
+}
+
 ; PR50143
 define i8* @store_zero_after_calloc_inaccessiblememonly() {
 ; CHECK-LABEL: @store_zero_after_calloc_inaccessiblememonly(
@@ -605,3 +626,52 @@ define i8* @memset_pattern16_after_calloc(i8* %pat) {
   call void @memset_pattern16(i8* %call, i8* %pat, i64 40000) #1
   ret i8* %call
 }
+
+ at n = global i32 0, align 4
+ at a = external global i32, align 4
+ at b = external global i32*, align 8
+
+; GCC calloc-1.c test case should create calloc
+define i8* @test_malloc_memset_to_calloc(i64* %0) {
+; CHECK-LABEL: @test_malloc_memset_to_calloc(
+; CHECK:       entry:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @n, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[CALLOC:%.*]] = call i8* @calloc(i64 1, i64 [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP0:%.*]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i64 [[TMP3]], 1
+; CHECK-NEXT:    store i64 [[TMP4]], i64* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8* [[CALLOC]], null
+; CHECK-NEXT:    br i1 [[TMP5]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:                                          ; preds = %entry
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i64 [[TMP3]], 2
+; CHECK-NEXT:    store i64 [[TMP6]], i64* [[TMP0]], align 8
+; CHECK-NEXT:    store i32 2, i32* @a, align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32*, i32** @b, align 8
+; CHECK-NEXT:    store i32 3, i32* [[TMP7]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:                                           ; preds = %if.then, %entry
+; CHECK-NEXT:    ret i8* [[CALLOC]]
+;
+entry:
+  %1 = load i32, i32* @n, align 4
+  %2 = sext i32 %1 to i64
+  %3 = tail call i8* @malloc(i64 %2) inaccessiblememonly
+  %4 = load i64, i64* %0, align 8
+  %5 = add nsw i64 %4, 1
+  store i64 %5, i64* %0, align 8
+  %6 = icmp eq i8* %3, null
+  br i1 %6, label %if.end, label %if.then
+
+if.then:
+  %7 = add nsw i64 %4, 2
+  store i64 %7, i64* %0, align 8
+  store i32 2, i32* @a, align 4
+  tail call void @llvm.memset.p0i8.i64(i8* align 4 %3, i8 0, i64 %2, i1 false)
+  %8 = load i32*, i32** @b, align 8
+  store i32 3, i32* %8, align 4
+  br label %if.end
+
+if.end:
+  ret i8* %3
+}


        


More information about the llvm-commits mailing list