[llvm] [InstCombine] Tighten use constraint in factorization transforms (PR #102943)
Kevin McAfee via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 22 13:55:52 PDT 2024
https://github.com/kalxr updated https://github.com/llvm/llvm-project/pull/102943
From 620ce7bfb98d6e735e8629a851f192e11adab1fd Mon Sep 17 00:00:00 2001
From: Kevin McAfee <kmcafee at nvidia.com>
Date: Tue, 13 Aug 2024 14:06:42 -0700
Subject: [PATCH 1/2] Precommit - transform creates extra arg uses
---
llvm/test/Transforms/InstCombine/add.ll | 54 +++++++++++++++++++++++++
1 file changed, 54 insertions(+)
diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll
index 417c3a950d7805..5e9e7c362321a8 100644
--- a/llvm/test/Transforms/InstCombine/add.ll
+++ b/llvm/test/Transforms/InstCombine/add.ll
@@ -2046,6 +2046,60 @@ define i8 @mul_add_common_factor_use(i8 %x, i8 %y) {
ret i8 %a
}
+; negative test - avoid creating extra uses of args
+
+define i8 @mul_add_common_factor_use2(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @mul_add_common_factor_use2(
+; CHECK-NEXT: [[M:%.*]] = mul i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: call void @use(i8 [[M]])
+; CHECK-NEXT: [[N1:%.*]] = add i8 [[Y]], [[Z:%.*]]
+; CHECK-NEXT: [[A:%.*]] = mul i8 [[X]], [[N1]]
+; CHECK-NEXT: ret i8 [[A]]
+;
+ %m = mul i8 %x, %y
+ %n = mul i8 %x, %z
+ call void @use(i8 %m)
+ %a = add i8 %m, %n
+ ret i8 %a
+}
+
+; negative test - avoid disturbing redundant expressions for no immediate improvement
+
+define void @mul_add_chain_common_factor_uses(i64 %a, i64 %b, i32 %c) {
+; CHECK-LABEL: @mul_add_chain_common_factor_uses(
+; CHECK-NEXT: [[MUL1:%.*]] = mul i64 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[SXT1:%.*]] = sext i32 [[C:%.*]] to i64
+; CHECK-NEXT: [[MUL11:%.*]] = add i64 [[A]], [[SXT1]]
+; CHECK-NEXT: [[ADD1:%.*]] = mul i64 [[MUL11]], [[B]]
+; CHECK-NEXT: call void @use(i64 [[ADD1]])
+; CHECK-NEXT: [[SXT2:%.*]] = sext i32 [[C]] to i64
+; CHECK-NEXT: [[ADD12:%.*]] = add i64 [[MUL11]], [[SXT2]]
+; CHECK-NEXT: [[ADD2:%.*]] = mul i64 [[ADD12]], [[B]]
+; CHECK-NEXT: call void @use(i64 [[ADD2]])
+; CHECK-NEXT: [[SXT3:%.*]] = sext i32 [[C]] to i64
+; CHECK-NEXT: [[ADD23:%.*]] = add i64 [[ADD12]], [[SXT3]]
+; CHECK-NEXT: [[ADD3:%.*]] = mul i64 [[ADD23]], [[B]]
+; CHECK-NEXT: call void @use(i64 [[ADD3]])
+; CHECK-NEXT: call void @use(i64 [[MUL1]])
+; CHECK-NEXT: ret void
+;
+ %mul1 = mul i64 %a, %b
+ %sxt1 = sext i32 %c to i64
+ %mul2 = mul i64 %b, %sxt1
+ %add1 = add i64 %mul1, %mul2
+ call void @use(i64 %add1)
+ %sxt2 = sext i32 %c to i64
+ %mul3 = mul i64 %b, %sxt2
+ %add2 = add i64 %add1, %mul3
+ call void @use(i64 %add2)
+ %sxt3 = sext i32 %c to i64
+ %mul4 = mul i64 %b, %sxt3
+ %add3 = add i64 %add2, %mul4
+ call void @use(i64 %add3)
+ call void @use(i64 %mul1)
+ ret void
+}
+
define i8 @not_mul(i8 %x) {
; CHECK-LABEL: @not_mul(
; CHECK-NEXT: [[TMP1:%.*]] = mul i8 [[X:%.*]], -41
From f9c2f0ff0ce2da9630e07ef7a50925c433fa4d8e Mon Sep 17 00:00:00 2001
From: Kevin McAfee <kmcafee at nvidia.com>
Date: Thu, 22 Aug 2024 13:39:09 -0700
Subject: [PATCH 2/2] [InstCombine] Tighten use constraint in factorization
transforms
---
.../InstCombine/InstCombineAddSub.cpp | 13 +++-
.../InstCombine/InstructionCombining.cpp | 14 +++--
llvm/test/Transforms/InstCombine/add.ll | 16 ++---
llvm/test/Transforms/InstCombine/and-or.ll | 12 ++--
llvm/test/Transforms/InstCombine/ctpop.ll | 12 ++--
.../test/Transforms/InstCombine/shl-factor.ll | 20 +++---
.../Transforms/LoopVectorize/induction.ll | 62 +++++++++----------
.../invariant-store-vectorization.ll | 6 +-
.../AArch64/indvars-vectorization.ll | 6 +-
9 files changed, 87 insertions(+), 74 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index d7758b5fbf1786..94c63883c0b081 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1409,7 +1409,8 @@ Instruction *InstCombinerImpl::
/// foldUsingDistributiveLaws. If that code can be made to work optimally
/// for multi-use cases or propagating nsw/nuw, then we would not need this.
static Instruction *factorizeMathWithShlOps(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
+ InstCombiner::BuilderTy &Builder,
+ const SimplifyQuery SQ) {
// TODO: Also handle mul by doubling the shift amount?
assert((I.getOpcode() == Instruction::Add ||
I.getOpcode() == Instruction::Sub) &&
@@ -1424,6 +1425,12 @@ static Instruction *factorizeMathWithShlOps(BinaryOperator &I,
!match(Op1, m_Shl(m_Value(Y), m_Specific(ShAmt))))
return nullptr;
+ // This transform is only profitable if both operations, or one operation and
+ // the resulting add/sub, can be eliminated/folded.
+ if (!(Op0->hasOneUse() && Op1->hasOneUse()) &&
+ !simplifyBinOp(I.getOpcode(), X, Y, SQ))
+ return nullptr;
+
// No-wrap propagates only when all ops have no-wrap.
bool HasNSW = I.hasNoSignedWrap() && Op0->hasNoSignedWrap() &&
Op1->hasNoSignedWrap();
@@ -1505,7 +1512,7 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
if (Instruction *R = foldBoxMultiply(I))
return R;
- if (Instruction *R = factorizeMathWithShlOps(I, Builder))
+ if (Instruction *R = factorizeMathWithShlOps(I, Builder, SQ))
return R;
if (Instruction *X = foldAddWithConstant(I))
@@ -2166,7 +2173,7 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
}
// Try this before Negator to preserve NSW flag.
- if (Instruction *R = factorizeMathWithShlOps(I, Builder))
+ if (Instruction *R = factorizeMathWithShlOps(I, Builder, SQ))
return R;
Constant *C;
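
For context, the check added above only lets factorizeMathWithShlOps fire when the factored form is clearly cheaper. A minimal IR sketch of the intent (the function names are illustrative and not part of the patch; @use8 mirrors the helper already used in shl-factor.ll):

  declare void @use8(i8)

  ; Both shifts have a single use, so (x << z) + (y << z) may still be
  ; factored to (x + y) << z.
  define i8 @shl_factor_one_use(i8 %x, i8 %y, i8 %z) {
    %xs = shl i8 %x, %z
    %ys = shl i8 %y, %z
    %sum = add i8 %xs, %ys
    ret i8 %sum
  }

  ; %xs has an extra use and must stay live, so with this patch the add is
  ; left alone unless simplifyBinOp can fold x + y outright.
  define i8 @shl_factor_extra_use(i8 %x, i8 %y, i8 %z) {
    %xs = shl i8 %x, %z
    call void @use8(i8 %xs)
    %ys = shl i8 %y, %z
    %sum = add i8 %xs, %ys
    ret i8 %sum
  }
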
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index c3f79fe4f901ad..5b789ebb199b46 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -688,9 +688,12 @@ static Value *tryFactorization(BinaryOperator &I, const SimplifyQuery &SQ,
// If "B op D" simplifies then it can be formed with no cost.
V = simplifyBinOp(TopLevelOpcode, B, D, SQ.getWithInstruction(&I));
- // If "B op D" doesn't simplify then only go on if one of the existing
+ // If "B op D" doesn't simplify then only go on if both of the existing
// operations "A op' B" and "C op' D" will be zapped as no longer used.
- if (!V && (LHS->hasOneUse() || RHS->hasOneUse()))
+ // Note that when an operation is equal to one of its operands, that
+ // operation is "zapped" by having never existed in the first place.
+ if (!V && (LHS->hasOneUse() || LHS == A || LHS == B) &&
+ (RHS->hasOneUse() || RHS == C || RHS == D))
V = Builder.CreateBinOp(TopLevelOpcode, B, D, RHS->getName());
if (V)
RetVal = Builder.CreateBinOp(InnerOpcode, A, V);
@@ -708,9 +711,12 @@ static Value *tryFactorization(BinaryOperator &I, const SimplifyQuery &SQ,
// If "A op C" simplifies then it can be formed with no cost.
V = simplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));
- // If "A op C" doesn't simplify then only go on if one of the existing
+ // If "A op C" doesn't simplify then only go on if both of the existing
// operations "A op' B" and "C op' D" will be zapped as no longer used.
- if (!V && (LHS->hasOneUse() || RHS->hasOneUse()))
+ // Note that when an operation is equal to one of its operands, that
+ // operation is "zapped" by having never existed in the first place.
+ if (!V && (LHS->hasOneUse() || LHS == A || LHS == B) &&
+ (RHS->hasOneUse() || RHS == C || RHS == D))
V = Builder.CreateBinOp(TopLevelOpcode, A, C, LHS->getName());
if (V)
RetVal = Builder.CreateBinOp(InnerOpcode, V, B);
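
To illustrate the tightened condition in tryFactorization, here is a small sketch based on the precommit test (function names are illustrative; @use mirrors the existing test helper). When both inner multiplies are single-use, the add can still be factored to x * (y + z); once one of them has another use, the fold is now skipped so the original muls stay live and no extra uses of %y and %z are introduced:

  declare void @use(i8)

  ; Both muls are single-use: may still factor to mul i8 %x, (add %y, %z).
  define i8 @factor_ok(i8 %x, i8 %y, i8 %z) {
    %m = mul i8 %x, %y
    %n = mul i8 %x, %z
    %a = add i8 %m, %n
    ret i8 %a
  }

  ; %m is kept alive by @use, so factoring would only add uses of %y and %z;
  ; with this patch the add/mul pair is left as-is.
  define i8 @factor_extra_use(i8 %x, i8 %y, i8 %z) {
    %m = mul i8 %x, %y
    call void @use(i8 %m)
    %n = mul i8 %x, %z
    %a = add i8 %m, %n
    ret i8 %a
  }
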
diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll
index 5e9e7c362321a8..f854a8d6789e8c 100644
--- a/llvm/test/Transforms/InstCombine/add.ll
+++ b/llvm/test/Transforms/InstCombine/add.ll
@@ -2051,9 +2051,9 @@ define i8 @mul_add_common_factor_use(i8 %x, i8 %y) {
define i8 @mul_add_common_factor_use2(i8 %x, i8 %y, i8 %z) {
; CHECK-LABEL: @mul_add_common_factor_use2(
; CHECK-NEXT: [[M:%.*]] = mul i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[N:%.*]] = mul i8 [[X]], [[Z:%.*]]
; CHECK-NEXT: call void @use(i8 [[M]])
-; CHECK-NEXT: [[N1:%.*]] = add i8 [[Y]], [[Z:%.*]]
-; CHECK-NEXT: [[A:%.*]] = mul i8 [[X]], [[N1]]
+; CHECK-NEXT: [[A:%.*]] = add i8 [[M]], [[N]]
; CHECK-NEXT: ret i8 [[A]]
;
%m = mul i8 %x, %y
@@ -2069,16 +2069,16 @@ define void @mul_add_chain_common_factor_uses(i64 %a, i64 %b, i32 %c) {
; CHECK-LABEL: @mul_add_chain_common_factor_uses(
; CHECK-NEXT: [[MUL1:%.*]] = mul i64 [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: [[SXT1:%.*]] = sext i32 [[C:%.*]] to i64
-; CHECK-NEXT: [[MUL11:%.*]] = add i64 [[A]], [[SXT1]]
-; CHECK-NEXT: [[ADD1:%.*]] = mul i64 [[MUL11]], [[B]]
+; CHECK-NEXT: [[MUL2:%.*]] = mul i64 [[B]], [[SXT1]]
+; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[MUL1]], [[MUL2]]
; CHECK-NEXT: call void @use(i64 [[ADD1]])
; CHECK-NEXT: [[SXT2:%.*]] = sext i32 [[C]] to i64
-; CHECK-NEXT: [[ADD12:%.*]] = add i64 [[MUL11]], [[SXT2]]
-; CHECK-NEXT: [[ADD2:%.*]] = mul i64 [[ADD12]], [[B]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul i64 [[B]], [[SXT2]]
+; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[ADD1]], [[MUL3]]
; CHECK-NEXT: call void @use(i64 [[ADD2]])
; CHECK-NEXT: [[SXT3:%.*]] = sext i32 [[C]] to i64
-; CHECK-NEXT: [[ADD23:%.*]] = add i64 [[ADD12]], [[SXT3]]
-; CHECK-NEXT: [[ADD3:%.*]] = mul i64 [[ADD23]], [[B]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul i64 [[B]], [[SXT3]]
+; CHECK-NEXT: [[ADD3:%.*]] = add i64 [[ADD2]], [[MUL4]]
; CHECK-NEXT: call void @use(i64 [[ADD3]])
; CHECK-NEXT: call void @use(i64 [[MUL1]])
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/InstCombine/and-or.ll b/llvm/test/Transforms/InstCombine/and-or.ll
index fee055a2e12451..ca8d922d56cd50 100644
--- a/llvm/test/Transforms/InstCombine/and-or.ll
+++ b/llvm/test/Transforms/InstCombine/and-or.ll
@@ -685,11 +685,11 @@ define i32 @or_or_and_noOneUse_fail1(i32 %a, i32 %b) {
; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[A:%.*]], 23
; CHECK-NEXT: [[AND:%.*]] = and i32 [[SHR]], 157
; CHECK-NEXT: call void @use2(i32 [[AND]])
-; CHECK-NEXT: [[AND1:%.*]] = or i32 [[B:%.*]], 157
-; CHECK-NEXT: [[OR:%.*]] = and i32 [[SHR]], [[AND1]]
+; CHECK-NEXT: [[AND3:%.*]] = and i32 [[SHR]], [[B:%.*]]
; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[B]], 23
; CHECK-NEXT: [[AND9:%.*]] = and i32 [[SHR8]], 157
-; CHECK-NEXT: [[R:%.*]] = or i32 [[OR]], [[AND9]]
+; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[AND3]], [[AND9]]
+; CHECK-NEXT: [[R:%.*]] = or i32 [[TMP1]], [[AND]]
; CHECK-NEXT: ret i32 [[R]]
;
%shr = ashr i32 %a, 23
@@ -714,9 +714,9 @@ define { i1, i1, i1, i1, i1 } @or_or_and_noOneUse_fail2(i1 %a_0, i1 %a_1, i1 %a_
; CHECK-NEXT: [[TMP3:%.*]] = and i1 [[A_1:%.*]], [[B_1:%.*]]
; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[TMP3]], true
; CHECK-NEXT: [[TMP5:%.*]] = and i1 [[TMP0]], [[A_1]]
-; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[A_1]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = and i1 [[TMP6]], [[B_1]]
-; CHECK-NEXT: [[D:%.*]] = or i1 [[TMP7]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = and i1 [[TMP2]], [[B_1]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[D:%.*]] = or i1 [[TMP7]], [[TMP3]]
; CHECK-NEXT: [[DOTNOT1:%.*]] = or i1 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { i1, i1, i1, i1, i1 } zeroinitializer, i1 [[D]], 0
; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { i1, i1, i1, i1, i1 } [[TMP8]], i1 [[TMP4]], 1
diff --git a/llvm/test/Transforms/InstCombine/ctpop.ll b/llvm/test/Transforms/InstCombine/ctpop.ll
index 940bb868c4c615..3177625b5cfb68 100644
--- a/llvm/test/Transforms/InstCombine/ctpop.ll
+++ b/llvm/test/Transforms/InstCombine/ctpop.ll
@@ -442,9 +442,9 @@ define i32 @parity_xor_extra_use(i32 %arg, i32 %arg1) {
; CHECK-NEXT: [[I:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[ARG:%.*]])
; CHECK-NEXT: [[I2:%.*]] = and i32 [[I]], 1
; CHECK-NEXT: tail call void @use(i32 [[I2]])
-; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[ARG1:%.*]], [[ARG]]
-; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[TMP1]])
-; CHECK-NEXT: [[I5:%.*]] = and i32 [[TMP2]], 1
+; CHECK-NEXT: [[I3:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[ARG1:%.*]])
+; CHECK-NEXT: [[I4:%.*]] = and i32 [[I3]], 1
+; CHECK-NEXT: [[I5:%.*]] = xor i32 [[I4]], [[I2]]
; CHECK-NEXT: ret i32 [[I5]]
;
%i = tail call i32 @llvm.ctpop.i32(i32 %arg)
@@ -461,9 +461,9 @@ define i32 @parity_xor_extra_use2(i32 %arg, i32 %arg1) {
; CHECK-NEXT: [[I:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[ARG1:%.*]])
; CHECK-NEXT: [[I2:%.*]] = and i32 [[I]], 1
; CHECK-NEXT: tail call void @use(i32 [[I2]])
-; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[ARG1]], [[ARG:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[TMP1]])
-; CHECK-NEXT: [[I5:%.*]] = and i32 [[TMP2]], 1
+; CHECK-NEXT: [[I3:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[I4:%.*]] = and i32 [[I3]], 1
+; CHECK-NEXT: [[I5:%.*]] = xor i32 [[I2]], [[I4]]
; CHECK-NEXT: ret i32 [[I5]]
;
%i = tail call i32 @llvm.ctpop.i32(i32 %arg1)
diff --git a/llvm/test/Transforms/InstCombine/shl-factor.ll b/llvm/test/Transforms/InstCombine/shl-factor.ll
index 996b15f27f6d34..dcdd5020f76645 100644
--- a/llvm/test/Transforms/InstCombine/shl-factor.ll
+++ b/llvm/test/Transforms/InstCombine/shl-factor.ll
@@ -43,8 +43,8 @@ define i8 @add_shl_same_amount_nsw_extra_use1(i8 %x, i8 %y, i8 %z) {
; CHECK-LABEL: @add_shl_same_amount_nsw_extra_use1(
; CHECK-NEXT: [[XS:%.*]] = shl nuw nsw i8 [[X:%.*]], [[Z:%.*]]
; CHECK-NEXT: call void @use8(i8 [[XS]])
-; CHECK-NEXT: [[TMP1:%.*]] = add nsw i8 [[X]], [[Y:%.*]]
-; CHECK-NEXT: [[DIFF:%.*]] = shl nsw i8 [[TMP1]], [[Z]]
+; CHECK-NEXT: [[YS:%.*]] = shl nuw nsw i8 [[Y:%.*]], [[Z]]
+; CHECK-NEXT: [[DIFF:%.*]] = add nsw i8 [[XS]], [[YS]]
; CHECK-NEXT: ret i8 [[DIFF]]
;
%xs = shl nsw nuw i8 %x, %z
@@ -56,10 +56,10 @@ define i8 @add_shl_same_amount_nsw_extra_use1(i8 %x, i8 %y, i8 %z) {
define i8 @add_shl_same_amount_nuw_extra_use2(i8 %x, i8 %y, i8 %z) {
; CHECK-LABEL: @add_shl_same_amount_nuw_extra_use2(
-; CHECK-NEXT: [[YS:%.*]] = shl nuw nsw i8 [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[XS:%.*]] = shl nuw i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[YS:%.*]] = shl nuw nsw i8 [[Y:%.*]], [[Z]]
; CHECK-NEXT: call void @use8(i8 [[YS]])
-; CHECK-NEXT: [[TMP1:%.*]] = add nuw i8 [[X:%.*]], [[Y]]
-; CHECK-NEXT: [[DIFF:%.*]] = shl nuw i8 [[TMP1]], [[Z]]
+; CHECK-NEXT: [[DIFF:%.*]] = add nuw nsw i8 [[XS]], [[YS]]
; CHECK-NEXT: ret i8 [[DIFF]]
;
%xs = shl nuw i8 %x, %z
@@ -174,8 +174,8 @@ define i8 @sub_shl_same_amount_nsw_extra_use1(i8 %x, i8 %y, i8 %z) {
; CHECK-LABEL: @sub_shl_same_amount_nsw_extra_use1(
; CHECK-NEXT: [[XS:%.*]] = shl nuw nsw i8 [[X:%.*]], [[Z:%.*]]
; CHECK-NEXT: call void @use8(i8 [[XS]])
-; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i8 [[X]], [[Y:%.*]]
-; CHECK-NEXT: [[DIFF:%.*]] = shl nsw i8 [[TMP1]], [[Z]]
+; CHECK-NEXT: [[YS:%.*]] = shl nuw nsw i8 [[Y:%.*]], [[Z]]
+; CHECK-NEXT: [[DIFF:%.*]] = sub nsw i8 [[XS]], [[YS]]
; CHECK-NEXT: ret i8 [[DIFF]]
;
%xs = shl nsw nuw i8 %x, %z
@@ -187,10 +187,10 @@ define i8 @sub_shl_same_amount_nsw_extra_use1(i8 %x, i8 %y, i8 %z) {
define i8 @sub_shl_same_amount_nuw_extra_use2(i8 %x, i8 %y, i8 %z) {
; CHECK-LABEL: @sub_shl_same_amount_nuw_extra_use2(
-; CHECK-NEXT: [[YS:%.*]] = shl nuw nsw i8 [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[XS:%.*]] = shl nuw i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[YS:%.*]] = shl nuw nsw i8 [[Y:%.*]], [[Z]]
; CHECK-NEXT: call void @use8(i8 [[YS]])
-; CHECK-NEXT: [[TMP1:%.*]] = sub nuw i8 [[X:%.*]], [[Y]]
-; CHECK-NEXT: [[DIFF:%.*]] = shl nuw i8 [[TMP1]], [[Z]]
+; CHECK-NEXT: [[DIFF:%.*]] = sub nuw nsw i8 [[XS]], [[YS]]
; CHECK-NEXT: ret i8 [[DIFF]]
;
%xs = shl nuw i8 %x, %z
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 08d05a1e2db69f..5ea708bb295803 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -3910,8 +3910,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 510
; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
; IND-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]]
-; IND-NEXT: [[EXT_MUL5:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]]
-; IND-NEXT: [[IND_END1:%.*]] = shl nuw nsw i32 [[EXT_MUL5]], 2
+; IND-NEXT: [[TMP10:%.*]] = shl nuw nsw i32 [[N_VEC]], 2
+; IND-NEXT: [[IND_END1:%.*]] = add nuw nsw i32 [[EXT_MUL]], [[TMP10]]
; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; IND-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
@@ -3921,13 +3921,13 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; IND-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8
; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST4]]
-; IND-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
-; IND-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
-; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
+; IND-NEXT: [[TMP11:%.*]] = sext i8 [[OFFSET_IDX]] to i64
+; IND-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]]
+; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP12]], align 4
; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
-; IND-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; IND-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; IND-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IND-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
; IND: middle.block:
; IND-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
; IND-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -3940,8 +3940,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; IND-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; IND-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
; IND-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; IND-NEXT: [[TMP13:%.*]] = sext i8 [[IDX]] to i64
-; IND-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
+; IND-NEXT: [[TMP14:%.*]] = sext i8 [[IDX]] to i64
+; IND-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
; IND-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4
; IND-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1
; IND-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32
@@ -3979,8 +3979,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 508
; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]]
-; UNROLL-NEXT: [[EXT_MUL6:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]]
-; UNROLL-NEXT: [[IND_END1:%.*]] = shl nuw nsw i32 [[EXT_MUL6]], 2
+; UNROLL-NEXT: [[TMP10:%.*]] = shl nuw nsw i32 [[N_VEC]], 2
+; UNROLL-NEXT: [[IND_END1:%.*]] = add nuw nsw i32 [[EXT_MUL]], [[TMP10]]
; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; UNROLL-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
@@ -3991,15 +3991,15 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
; UNROLL-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8
; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST5]]
-; UNROLL-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
-; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
-; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 8
-; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
-; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4
+; UNROLL-NEXT: [[TMP11:%.*]] = sext i8 [[OFFSET_IDX]] to i64
+; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]]
+; UNROLL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 8
+; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP12]], align 4
+; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP13]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 16, i32 16>
-; UNROLL-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
; UNROLL: middle.block:
; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -4012,8 +4012,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; UNROLL-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; UNROLL-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
; UNROLL-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; UNROLL-NEXT: [[TMP14:%.*]] = sext i8 [[IDX]] to i64
-; UNROLL-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
+; UNROLL-NEXT: [[TMP15:%.*]] = sext i8 [[IDX]] to i64
+; UNROLL-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]]
; UNROLL-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4
; UNROLL-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1
; UNROLL-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32
@@ -4129,8 +4129,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 504
; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]]
-; INTERLEAVE-NEXT: [[EXT_MUL6:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]]
-; INTERLEAVE-NEXT: [[IND_END1:%.*]] = shl nuw nsw i32 [[EXT_MUL6]], 2
+; INTERLEAVE-NEXT: [[TMP10:%.*]] = shl nuw nsw i32 [[N_VEC]], 2
+; INTERLEAVE-NEXT: [[IND_END1:%.*]] = add nuw nsw i32 [[EXT_MUL]], [[TMP10]]
; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT_MUL]], i64 0
; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], <i32 0, i32 4, i32 8, i32 12>
@@ -4141,15 +4141,15 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 16, i32 16, i32 16, i32 16>
; INTERLEAVE-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8
; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST5]]
-; INTERLEAVE-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
-; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
-; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 16
-; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
-; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4
+; INTERLEAVE-NEXT: [[TMP11:%.*]] = sext i8 [[OFFSET_IDX]] to i64
+; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP11]]
+; INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 16
+; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP12]], align 4
+; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP13]], align 4
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 32, i32 32, i32 32, i32 32>
-; INTERLEAVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; INTERLEAVE-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
; INTERLEAVE: middle.block:
; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -4162,8 +4162,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; INTERLEAVE-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; INTERLEAVE-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
; INTERLEAVE-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; INTERLEAVE-NEXT: [[TMP14:%.*]] = sext i8 [[IDX]] to i64
-; INTERLEAVE-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
+; INTERLEAVE-NEXT: [[TMP15:%.*]] = sext i8 [[IDX]] to i64
+; INTERLEAVE-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]]
; INTERLEAVE-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4
; INTERLEAVE-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1
; INTERLEAVE-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index 63381454cc5900..e520899fb6d16b 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -371,9 +371,9 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly
; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[J_022]], -1
; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[ITR]], [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
-; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i64 [[TMP4]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2
-; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SCEVGEP2]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SCEVGEP2]], i64 [[TMP9]]
+; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP13]]
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[VAR1]], [[SCEVGEP3]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
index af24a9ab9e3f7e..0b330da54a12bd 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
@@ -37,10 +37,10 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef
; CHECK-NEXT: [[UMIN:%.*]] = zext i1 [[TMP11]] to i64
; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP10]], [[UMIN]]
; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[SMAX]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], [[UMIN]]
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP0]]
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP13]], [[UMIN]]
; CHECK-NEXT: [[TMP16:%.*]] = shl i64 [[TMP15]], 2
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 4
+; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP16]], [[TMP9]]
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP27]], 4
; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP17]]
; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]