[llvm] [InstCombine] Don't use dominating conditions to transform sub into xor. (PR #88566)

Tue Apr 16 21:45:29 PDT 2024

https://github.com/topperc updated https://github.com/llvm/llvm-project/pull/88566

>From 0c05324f9801b1eedc712f81e6ef5b0bc7fbcc34 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 12 Apr 2024 12:35:40 -0700
Subject: [PATCH 1/3] [InstCombine] Add test case for turning sub into xor
 using dominating condition. NFC

I plan to disable using dominating conditions for turning sub into
xor, but first we need that demonstrates it currently happens.
---
 llvm/test/Transforms/InstCombine/sub-xor.ll | 23 +++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/sub-xor.ll b/llvm/test/Transforms/InstCombine/sub-xor.ll
index 71da73d51ae37e..b4e87d0405fc48 100644
--- a/llvm/test/Transforms/InstCombine/sub-xor.ll
+++ b/llvm/test/Transforms/InstCombine/sub-xor.ll
@@ -157,3 +157,26 @@ define <2 x i8> @xor_add_splat_undef(<2 x i8> %x) {
   %add = add <2 x i8> %xor, <i8 42, i8 42>
   ret <2 x i8> %add
 }
+
+define i32 @xor_dominating_cond(i32 %x) {
+; CHECK-LABEL: @xor_dominating_cond(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp ult i32 [[X:%.*]], 256
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[A:%.*]] = xor i32 [[X]], 255
+; CHECK-NEXT:    ret i32 [[A]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret i32 [[X]]
+;
+entry:
+  %cond = icmp ult i32 %x, 256
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  %a = sub i32 255, %x
+  ret i32 %a
+
+if.end:
+  ret i32 %x
+}

>From 2e86abe8783ddb550cbccf088776891bb45a4d4a Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Tue, 16 Apr 2024 21:25:28 -0700
Subject: [PATCH 2/3] [InstCombine] Add phase ordering test for #88239. NFC

---
 .../Transforms/PhaseOrdering/X86/pr88239.ll   | 55 +++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll
new file mode 100644
index 00000000000000..3afa1904fb249a
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes="default<O3>" -mcpu=skx -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(ptr noalias noundef %0, ptr noalias noundef %1) optsize {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ptr noalias nocapture noundef readonly [[TMP0:%.*]], ptr noalias nocapture noundef writeonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    br label [[TMP4:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[TMP4]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[TMP2]] ], [ [[VEC_IND_NEXT:%.*]], [[TMP4]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = and <8 x i64> [[VEC_IND]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <8 x i64> [[TMP6]], <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], <8 x i64> [[TMP3]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = tail call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP7]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store <8 x i32> [[TMP5]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 256
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[TMP4]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    ret void
+;
+  br label %3
+
+3:                                                ; preds = %7, %2
+  %4 = phi i32 [ 0, %2 ], [ %15, %7 ]
+  %5 = icmp slt i32 %4, 256
+  br i1 %5, label %7, label %6
+
+6:                                                ; preds = %3
+  ret void
+
+7:                                                ; preds = %3
+  %8 = sub nsw i32 255, %4
+  %9 = zext nneg i32 %8 to i64
+  %10 = getelementptr inbounds i32, ptr %0, i64 %9
+  %11 = load i32, ptr %10, align 4
+  %12 = add nsw i32 %11, 5
+  %13 = sext i32 %4 to i64
+  %14 = getelementptr inbounds i32, ptr %1, i64 %13
+  store i32 %12, ptr %14, align 4
+  %15 = add nsw i32 %4, 1
+  br label %3
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.

>From bba4ceb82f63f1d46ca11b397f64c34d242463a2 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 12 Apr 2024 12:38:02 -0700
Subject: [PATCH 3/3] [InstCombine] Don't use dominating conditions to
 transform sub into xor.

Other passes are unable to reverse this transform if we use dominating
conditions.

Fixes #88239.
---
 llvm/include/llvm/Analysis/SimplifyQuery.h          |  6 ++++++
 .../Transforms/InstCombine/InstCombineAddSub.cpp    |  6 ++++--
 llvm/test/Transforms/InstCombine/sub-xor.ll         |  4 +++-
 llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll   | 13 ++++++-------
 4 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/llvm/include/llvm/Analysis/SimplifyQuery.h b/llvm/include/llvm/Analysis/SimplifyQuery.h
index e5e6ae0d3d8e3e..a10c0dc49fa22e 100644
--- a/llvm/include/llvm/Analysis/SimplifyQuery.h
+++ b/llvm/include/llvm/Analysis/SimplifyQuery.h
@@ -113,6 +113,12 @@ struct SimplifyQuery {
     using namespace PatternMatch;
     return match(V, m_Undef());
   }
+
+  SimplifyQuery getWithoutDomCondCache() const {
+    SimplifyQuery Copy(*this);
+    Copy.DC = nullptr;
+    return Copy;
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 07c50d866544b3..c8734bc00c4b59 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2281,8 +2281,10 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
   if (match(Op0, m_APInt(Op0C))) {
     if (Op0C->isMask()) {
       // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
-      // zero.
-      KnownBits RHSKnown = computeKnownBits(Op1, 0, &I);
+      // zero. We don't use information from dominating conditions so this
+      // transform is easier to reverse if necessary.
+      KnownBits RHSKnown = llvm::computeKnownBits(
+          Op1, 0, SQ.getWithInstruction(&I).getWithoutDomCondCache());
       if ((*Op0C | RHSKnown.Zero).isAllOnes())
         return BinaryOperator::CreateXor(Op1, Op0);
     }
diff --git a/llvm/test/Transforms/InstCombine/sub-xor.ll b/llvm/test/Transforms/InstCombine/sub-xor.ll
index b4e87d0405fc48..2976598e043fee 100644
--- a/llvm/test/Transforms/InstCombine/sub-xor.ll
+++ b/llvm/test/Transforms/InstCombine/sub-xor.ll
@@ -158,13 +158,15 @@ define <2 x i8> @xor_add_splat_undef(<2 x i8> %x) {
   ret <2 x i8> %add
 }
 
+; Make sure we don't convert sub to xor using dominating condition. That makes
+; it hard for other passe to reverse.
 define i32 @xor_dominating_cond(i32 %x) {
 ; CHECK-LABEL: @xor_dominating_cond(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COND:%.*]] = icmp ult i32 [[X:%.*]], 256
 ; CHECK-NEXT:    br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    [[A:%.*]] = xor i32 [[X]], 255
+; CHECK-NEXT:    [[A:%.*]] = sub nuw nsw i32 255, [[X]]
 ; CHECK-NEXT:    ret i32 [[A]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    ret i32 [[X]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll
index 3afa1904fb249a..1c16b37144f1eb 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll
@@ -8,19 +8,18 @@ define void @foo(ptr noalias noundef %0, ptr noalias noundef %1) optsize {
 ; CHECK-LABEL: define void @foo(
 ; CHECK-SAME: ptr noalias nocapture noundef readonly [[TMP0:%.*]], ptr noalias nocapture noundef writeonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[TMP0]], i64 -28
 ; CHECK-NEXT:    br label [[TMP4:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[TMP4]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[TMP2]] ], [ [[VEC_IND_NEXT:%.*]], [[TMP4]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = and <8 x i64> [[VEC_IND]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <8 x i64> [[TMP6]], <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], <8 x i64> [[TMP3]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = tail call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP7]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = sub nuw nsw i64 255, [[INDVARS_IV]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[INVARIANT_GEP]], i64 [[TMP3]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = load <8 x i32>, ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store <8 x i32> [[TMP5]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    store <8 x i32> [[TMP6]], ptr [[TMP10]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[TMP4]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block: