[llvm] r311366 - [LibCallSimplifier] try harder to fold memcmp with constant arguments (2nd try)

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 21 12:13:15 PDT 2017


Author: spatel
Date: Mon Aug 21 12:13:14 2017
New Revision: 311366

URL: http://llvm.org/viewvc/llvm-project?rev=311366&view=rev
Log:
[LibCallSimplifier] try harder to fold memcmp with constant arguments (2nd try)

The first attempt was reverted because it could infinite-loop by creating a dead instruction.
That is fixed here, and a test case was added to verify it.

Original commit message:

Try to fold:
memcmp(X, C, ConstantLength) == 0 --> load X == *C

Without this change, we're unnecessarily checking the alignment of the constant data,
so we miss the transform in the first 2 tests in the patch.
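
As a C-level sketch of the fold (illustrative only; the helper names are
invented here, and the 16777216 value assumes a little-endian target, as in
the updated tests below):

#include <cstdint>
#include <cstring>

static const unsigned char kBuf[4] = {0, 0, 0, 1}; // stand-in for @charbuf

// Before the fold: a real call to the memcmp library routine.
bool eq4_before(const unsigned char *x) {
  return std::memcmp(x, kBuf, 4) == 0;
}

// After the fold: one 4-byte load of 'x' compared against the constant's
// bytes reinterpreted as an integer. The constant side is never loaded at
// run time, so its alignment doesn't matter.
bool eq4_after(const unsigned char *x) {
  uint32_t lhs;
  std::memcpy(&lhs, x, sizeof(lhs)); // the one remaining load
  return lhs == UINT32_C(0x01000000); // 16777216 on a little-endian target
}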

I noted this shortcoming of LibCallSimplifier in one of the recent CGP memcmp expansion
patches. This doesn't help the example in:
https://bugs.llvm.org/show_bug.cgi?id=34032#c13
...directly, but it's worth short-circuiting more of these simple cases since we're
already trying to do that.

The benefit of transforming to load+cmp is that existing IR analysis/transforms may
further simplify that code. For example, if the load of the variable is common to
multiple memcmp calls, CSE can remove the duplicate instructions.
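
For example (hypothetical code, not from the patch): after both calls below
are rewritten to load+compare, the two 4-byte loads of 'x' are identical,
and GVN/CSE can keep just one of them:

#include <cstring>

static const unsigned char kA[4] = {0, 0, 0, 1};
static const unsigned char kB[4] = {1, 0, 0, 0};

int classify(const unsigned char *x) {
  if (std::memcmp(x, kA, 4) == 0) return 1; // becomes: load x, compare to constant A
  if (std::memcmp(x, kB, 4) == 0) return 2; // becomes: compare the same load of x
  return 0;
}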

Differential Revision: https://reviews.llvm.org/D36922

Modified:
    llvm/trunk/lib/Transforms/Utils/SimplifyLibCalls.cpp
    llvm/trunk/test/Transforms/InstCombine/memcmp-constant-fold.ll

Modified: llvm/trunk/lib/Transforms/Utils/SimplifyLibCalls.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Utils/SimplifyLibCalls.cpp?rev=311366&r1=311365&r2=311366&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Utils/SimplifyLibCalls.cpp (original)
+++ llvm/trunk/lib/Transforms/Utils/SimplifyLibCalls.cpp Mon Aug 21 12:13:14 2017
@@ -18,6 +18,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -751,29 +752,44 @@ Value *LibCallSimplifier::optimizeMemCmp
   }
 
   // memcmp(S1,S2,N/8)==0 -> (*(intN_t*)S1 != *(intN_t*)S2)==0
+  // TODO: The case where both inputs are constants does not need to be limited
+  // to legal integers or equality comparison. See block below this.
   if (DL.isLegalInteger(Len * 8) && isOnlyUsedInZeroEqualityComparison(CI)) {
-
     IntegerType *IntType = IntegerType::get(CI->getContext(), Len * 8);
     unsigned PrefAlignment = DL.getPrefTypeAlignment(IntType);
 
-    if (getKnownAlignment(LHS, DL, CI) >= PrefAlignment &&
-        getKnownAlignment(RHS, DL, CI) >= PrefAlignment) {
-
-      Type *LHSPtrTy =
-          IntType->getPointerTo(LHS->getType()->getPointerAddressSpace());
-      Type *RHSPtrTy =
-          IntType->getPointerTo(RHS->getType()->getPointerAddressSpace());
-
-      Value *LHSV =
-          B.CreateLoad(B.CreateBitCast(LHS, LHSPtrTy, "lhsc"), "lhsv");
-      Value *RHSV =
-          B.CreateLoad(B.CreateBitCast(RHS, RHSPtrTy, "rhsc"), "rhsv");
+    // First, see if we can fold either argument to a constant.
+    Value *LHSV = nullptr;
+    if (auto *LHSC = dyn_cast<Constant>(LHS)) {
+      LHSC = ConstantExpr::getBitCast(LHSC, IntType->getPointerTo());
+      LHSV = ConstantFoldLoadFromConstPtr(LHSC, IntType, DL);
+    }
+    Value *RHSV = nullptr;
+    if (auto *RHSC = dyn_cast<Constant>(RHS)) {
+      RHSC = ConstantExpr::getBitCast(RHSC, IntType->getPointerTo());
+      RHSV = ConstantFoldLoadFromConstPtr(RHSC, IntType, DL);
+    }
 
+    // Don't generate unaligned loads. If either source is constant data,
+    // alignment doesn't matter for that source because there is no load.
+    if ((LHSV || getKnownAlignment(LHS, DL, CI) >= PrefAlignment) &&
+        (RHSV || getKnownAlignment(RHS, DL, CI) >= PrefAlignment)) {
+      if (!LHSV) {
+        Type *LHSPtrTy =
+            IntType->getPointerTo(LHS->getType()->getPointerAddressSpace());
+        LHSV = B.CreateLoad(B.CreateBitCast(LHS, LHSPtrTy), "lhsv");
+      }
+      if (!RHSV) {
+        Type *RHSPtrTy =
+            IntType->getPointerTo(RHS->getType()->getPointerAddressSpace());
+        RHSV = B.CreateLoad(B.CreateBitCast(RHS, RHSPtrTy), "rhsv");
+      }
       return B.CreateZExt(B.CreateICmpNE(LHSV, RHSV), CI->getType(), "memcmp");
     }
   }
 
-  // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant)
+  // Constant folding: memcmp(x, y, Len) -> constant (all arguments are const).
+  // TODO: This is limited to i8 arrays.
   StringRef LHSStr, RHSStr;
   if (getConstantStringInfo(LHS, LHSStr) &&
       getConstantStringInfo(RHS, RHSStr)) {

Modified: llvm/trunk/test/Transforms/InstCombine/memcmp-constant-fold.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/memcmp-constant-fold.ll?rev=311366&r1=311365&r2=311366&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/memcmp-constant-fold.ll (original)
+++ llvm/trunk/test/Transforms/InstCombine/memcmp-constant-fold.ll Mon Aug 21 12:13:14 2017
@@ -3,31 +3,45 @@
 
 declare i32 @memcmp(i8*, i8*, i64)
 
-; TODO: The alignment of this constant does not matter. We constant fold the load.
+; The alignment of this constant does not matter. We constant fold the load.
 
 @charbuf = private unnamed_addr constant [4 x i8] [i8 0, i8 0, i8 0, i8 1], align 1
 
 define i1 @memcmp_4bytes_unaligned_constant_i8(i8* align 4 %x) {
-; ALL-LABEL: @memcmp_4bytes_unaligned_constant_i8(
-; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @charbuf, i64 0, i64 0), i64 4)
-; ALL-NEXT:    [[CMPEQ0:%.*]] = icmp eq i32 [[CALL]], 0
-; ALL-NEXT:    ret i1 [[CMPEQ0]]
+; LE-LABEL: @memcmp_4bytes_unaligned_constant_i8(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast i8* %x to i32*
+; LE-NEXT:    [[LHSV:%.*]] = load i32, i32* [[TMP1]], align 4
+; LE-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[LHSV]], 16777216
+; LE-NEXT:    ret i1 [[TMP2]]
+;
+; BE-LABEL: @memcmp_4bytes_unaligned_constant_i8(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast i8* %x to i32*
+; BE-NEXT:    [[LHSV:%.*]] = load i32, i32* [[TMP1]], align 4
+; BE-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[LHSV]], 1
+; BE-NEXT:    ret i1 [[TMP2]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @charbuf, i64 0, i64 0), i64 4)
   %cmpeq0 = icmp eq i32 %call, 0
   ret i1 %cmpeq0
 }
 
-; TODO: We still don't care about alignment of the constant. We are not limited to constant folding only i8 arrays.
+; We still don't care about alignment of the constant. We are not limited to constant folding only i8 arrays.
 ; It doesn't matter if the constant operand is the first operand to the memcmp.
 
 @intbuf_unaligned = private unnamed_addr constant [4 x i16] [i16 1, i16 2, i16 3, i16 4], align 1
 
 define i1 @memcmp_4bytes_unaligned_constant_i16(i8* align 4 %x) {
-; ALL-LABEL: @memcmp_4bytes_unaligned_constant_i16(
-; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* bitcast ([4 x i16]* @intbuf_unaligned to i8*), i8* %x, i64 4)
-; ALL-NEXT:    [[CMPEQ0:%.*]] = icmp eq i32 [[CALL]], 0
-; ALL-NEXT:    ret i1 [[CMPEQ0]]
+; LE-LABEL: @memcmp_4bytes_unaligned_constant_i16(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast i8* %x to i32*
+; LE-NEXT:    [[RHSV:%.*]] = load i32, i32* [[TMP1]], align 4
+; LE-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[RHSV]], 131073
+; LE-NEXT:    ret i1 [[TMP2]]
+;
+; BE-LABEL: @memcmp_4bytes_unaligned_constant_i16(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast i8* %x to i32*
+; BE-NEXT:    [[RHSV:%.*]] = load i32, i32* [[TMP1]], align 4
+; BE-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[RHSV]], 65538
+; BE-NEXT:    ret i1 [[TMP2]]
 ;
   %call = tail call i32 @memcmp(i8* bitcast (i16* getelementptr inbounds ([4 x i16], [4 x i16]* @intbuf_unaligned, i64 0, i64 0) to i8*), i8* %x, i64 4)
   %cmpeq0 = icmp eq i32 %call, 0
@@ -48,4 +62,19 @@ define i1 @memcmp_3bytes_aligned_constan
   %cmpeq0 = icmp eq i32 %call, 0
   ret i1 %cmpeq0
 }
+
+; A sloppy implementation would infinite loop by recreating the unused instructions.
+
+define i1 @memcmp_4bytes_one_unaligned_i8(i8* align 4 %x, i8* align 1 %y) {
+; ALL-LABEL: @memcmp_4bytes_one_unaligned_i8(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
+; ALL-NEXT:    [[CMPEQ0:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    ret i1 [[CMPEQ0]]
+;
+  %bc = bitcast i8* %x to i32*
+  %lhsv = load i32, i32* %bc
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
+  %cmpeq0 = icmp eq i32 %call, 0
+  ret i1 %cmpeq0
+}
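
As a sanity check on the endian-dependent constants in the CHECK lines above
(an illustrative snippet, not part of the commit; run on a little-endian host,
it reproduces the LE values):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t v;

  // @charbuf bytes {0,0,0,1} read as a u32: 0x01000000 = 16777216 on a
  // little-endian host; the BE run of the test expects 1 instead.
  const unsigned char charbuf[4] = {0, 0, 0, 1};
  std::memcpy(&v, charbuf, 4);
  assert(v == 16777216u);

  // @intbuf_unaligned {i16 1, i16 2} stored little-endian is the bytes
  // {1,0,2,0}; read as a u32 that is 0x00020001 = 131073 (BE sees 65538).
  const uint16_t intbuf[2] = {1, 2};
  std::memcpy(&v, intbuf, 4);
  assert(v == 131073u);
  return 0;
}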
 