[llvm] [LLVM] Improve the DemandedBits Analysis (PR #148853)

Tue Jul 15 07:12:33 PDT 2025

https://github.com/karouzakisp created https://github.com/llvm/llvm-project/pull/148853

This patch adds DemandedBits support for operators that were previously unhandled: SDiv, UDiv, URem, and SRem. It also extends the shift operators (Shl, LShr, and AShr) to handle non-constant shift amounts, and refines the handling of multiplication by a constant. Compared with upstream LLVM using the Oz pipeline, this showed up to a 10% code size reduction on the LLVM test suite.

>From 5fdd1f05c679455a4c84b12f76b6628548262174 Mon Sep 17 00:00:00 2001
From: Panagiotis Karouzakis <karouzakispan at gmail.com>
Date: Mon, 14 Jul 2025 19:45:13 +0300
Subject: [PATCH] [LLVM] Improve the DemandedBits Analysis

This patch adds DemandedBits support for operators that were previously
unhandled: SDiv, UDiv, URem, and SRem. It also extends the shift
operators (Shl, LShr, and AShr) to handle non-constant shift amounts,
and refines the handling of multiplication by a constant.
Compared with upstream LLVM using the Oz pipeline, this showed up to
a 10% code size reduction on the LLVM test suite.
---
 llvm/lib/Analysis/DemandedBits.cpp            |  97 ++++++-
 llvm/test/Analysis/DemandedBits/basic.ll      |  25 ++
 llvm/test/Analysis/DemandedBits/div_rem.ll    | 261 ++++++++++++++++++
 llvm/test/Analysis/DemandedBits/shl.ll        |  48 +++-
 .../AArch64/induction-costs-sve.ll            |  38 +--
 .../scalable-trunc-min-bitwidth.ll            |   6 +-
 6 files changed, 448 insertions(+), 27 deletions(-)
 create mode 100644 llvm/test/Analysis/DemandedBits/div_rem.ll

diff --git a/llvm/lib/Analysis/DemandedBits.cpp b/llvm/lib/Analysis/DemandedBits.cpp
index 6694d5cc06c8c..1fa94e95cbceb 100644
--- a/llvm/lib/Analysis/DemandedBits.cpp
+++ b/llvm/lib/Analysis/DemandedBits.cpp
@@ -36,6 +36,7 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cstdint>
@@ -164,10 +165,24 @@ void DemandedBits::determineLiveOperandBits(
     }
     break;
   case Instruction::Mul:
-    // Find the highest live output bit. We don't need any more input
-    // bits than that (adds, and thus subtracts, ripple only to the
-    // left).
-    AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits());
+    const APInt *C;
+    if (OperandNo == 0) {
+      // For a constant multiplier C, keep the low H - LogC operand bits, where
+      // H = AOut.getActiveBits() and LogC = floor(log2(C)) + 1 (0 if C == 1).
+      if (match(UserI->getOperand(1), m_APInt(C))) {
+        auto LogC = C->isOne() ? 0 : C->logBase2() + 1;
+        unsigned Need =
+            AOut.getActiveBits() > LogC ? AOut.getActiveBits() - LogC : 0;
+        AB = APInt::getLowBitsSet(BitWidth, Need);
+      } else { // TODO: we can possibly check for Op0 constant too
+        AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits());
+      }
+    } else {
+      // Find the highest live output bit. We don't need any more input
+      // bits than that (adds, and thus subtracts, ripple only to the
+      // left).
+      AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits());
+    }
     break;
   case Instruction::Shl:
     if (OperandNo == 0) {
@@ -183,6 +198,17 @@ void DemandedBits::determineLiveOperandBits(
           AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1);
         else if (S->hasNoUnsignedWrap())
           AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
+      } else {
+        ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr);
+        unsigned Min = Known.getMinValue().getLimitedValue(BitWidth - 1);
+        unsigned Max = Known.getMaxValue().getLimitedValue(BitWidth - 1);
+        // similar to Lshr case
+        AB = (AOut.lshr(Min) | AOut.lshr(Max));
+        const auto *S = cast<ShlOperator>(UserI);
+        if (S->hasNoSignedWrap())
+          AB |= APInt::getHighBitsSet(BitWidth, Max + 1);
+        else if (S->hasNoUnsignedWrap())
+          AB |= APInt::getHighBitsSet(BitWidth, Max);
       }
     }
     break;
@@ -197,6 +223,19 @@ void DemandedBits::determineLiveOperandBits(
         // (they must be zero).
         if (cast<LShrOperator>(UserI)->isExact())
           AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+      } else {
+        ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr);
+        unsigned Min = Known.getMinValue().getLimitedValue(BitWidth - 1);
+        unsigned Max = Known.getMaxValue().getLimitedValue(BitWidth - 1);
+        // Suppose AOut == 0b0000 0011
+        // [min, max] = [1, 3]
+        // shift by 1 we get 0b0000 0110
+        // shift by 2 we get 0b0000 1100
+        // shift by 3 we get 0b0001 1000
+        // OR together the masks for the Min and Max shift amounts.
+        AB = (AOut.shl(Min) | AOut.shl(Max));
+        if (cast<LShrOperator>(UserI)->isExact())
+          AB |= APInt::getLowBitsSet(BitWidth, Max);
       }
     }
     break;
@@ -217,6 +256,27 @@ void DemandedBits::determineLiveOperandBits(
         // (they must be zero).
         if (cast<AShrOperator>(UserI)->isExact())
           AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+      } else {
+        ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr);
+        unsigned Min = Known.getMinValue().getLimitedValue(BitWidth - 1);
+        unsigned Max = Known.getMaxValue().getLimitedValue(BitWidth - 1);
+        AB = (AOut.shl(Min) | AOut.shl(Max));
+
+        if (Max) {
+          // Suppose AOut = 0011 1100
+          // [min, max] = [1, 3]
+          // ShiftAmount = 1 : Mask is 1000 0000
+          // ShiftAmount = 2 : Mask is 1100 0000
+          // ShiftAmount = 3 : Mask is 1110 0000
+          // The Mask with Max covers every case in [min, max],
+          // so we are done
+          if ((AOut & APInt::getHighBitsSet(BitWidth, Max)).getBoolValue())
+            AB.setSignBit();
+        }
+        // If the shift is exact, then the low bits are not dead
+        // (they must be zero).
+        if (cast<AShrOperator>(UserI)->isExact())
+          AB |= APInt::getLowBitsSet(BitWidth, Max);
       }
     }
     break;
@@ -246,6 +306,35 @@ void DemandedBits::determineLiveOperandBits(
     else
       AB &= ~(Known.One & ~Known2.One);
     break;
+  case Instruction::UDiv:
+  case Instruction::URem:
+  case Instruction::SDiv:
+  case Instruction::SRem: {
+    auto Opc = UserI->getOpcode();
+    auto IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
+    bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
+    if (OperandNo == 0) {
+      const APInt *DivAmnt;
+      if (match(UserI->getOperand(1), m_APInt(DivAmnt))) {
+        uint64_t D = DivAmnt->getZExtValue();
+        if (isPowerOf2_64(D)) {
+          unsigned Sh = Log2_64(D);
+          if (IsDiv) {
+            AB = AOut.shl(Sh);
+          } else {
+            AB = AOut & APInt::getLowBitsSet(BitWidth, Sh);
+          }
+        } else { // Non power of 2 constant div
+          unsigned LowQ = AOut.getActiveBits();
+          unsigned Need = LowQ + Log2_64_Ceil(D);
+          if (IsSigned)
+            Need++;
+          AB = APInt::getLowBitsSet(BitWidth, std::min(BitWidth, Need));
+        }
+      }
+    }
+    break;
+  }
   case Instruction::Xor:
   case Instruction::PHI:
     AB = AOut;
diff --git a/llvm/test/Analysis/DemandedBits/basic.ll b/llvm/test/Analysis/DemandedBits/basic.ll
index 4dc59c5392935..62eba9eaa81c5 100644
--- a/llvm/test/Analysis/DemandedBits/basic.ll
+++ b/llvm/test/Analysis/DemandedBits/basic.ll
@@ -25,3 +25,28 @@ define i8 @test_mul(i32 %a, i32 %b) {
   %6 = add nsw i8 %3, %5
   ret i8 %6
 }
+; CHECK-LABEL: Printing analysis 'Demanded Bits Analysis' for function 'test_mul_constant':
+; CHECK-DAG: DemandedBits: 0xff for   %3 = trunc i32 %2 to i8
+; CHECK-DAG: DemandedBits: 0xff for %2 in   %3 = trunc i32 %2 to i8
+; CHECK-DAG: DemandedBits: 0xff for   %2 = mul nsw i32 %1, 6
+; CHECK-DAG: DemandedBits: 0x1f for %1 in   %2 = mul nsw i32 %1, 6
+; CHECK-DAG: DemandedBits: 0xff for 6 in   %2 = mul nsw i32 %1, 6
+; CHECK-DAG: DemandedBits: 0x1 for   %4 = trunc i32 %2 to i1
+; CHECK-DAG: DemandedBits: 0x1 for %2 in   %4 = trunc i32 %2 to i1
+; CHECK-DAG: DemandedBits: 0x1f for   %1 = add nsw i32 %a, 12
+; CHECK-DAG: DemandedBits: 0x1f for %a in   %1 = add nsw i32 %a, 12
+; CHECK-DAG: DemandedBits: 0x1f for 12 in   %1 = add nsw i32 %a, 12
+; CHECK-DAG: DemandedBits: 0xff for   %5 = zext i1 %4 to i8
+; CHECK-DAG: DemandedBits: 0x1 for %4 in   %5 = zext i1 %4 to i8
+; CHECK-DAG: DemandedBits: 0xff for   %6 = add nsw i8 %3, %5
+; CHECK-DAG: DemandedBits: 0xff for %3 in   %6 = add nsw i8 %3, %5
+; CHECK-DAG: DemandedBits: 0xff for %5 in   %6 = add nsw i8 %3, %5
+define i8 @test_mul_constant(i32 %a, i32 %b){
+  %1 = add nsw i32 %a, 12
+  %2 = mul nsw i32 %1, 6
+  %3 = trunc i32 %2 to i8
+  %4 = trunc i32 %2 to i1
+  %5 = zext i1 %4 to i8
+  %6 = add nsw i8 %3, %5
+  ret i8 %6
+}
diff --git a/llvm/test/Analysis/DemandedBits/div_rem.ll b/llvm/test/Analysis/DemandedBits/div_rem.ll
new file mode 100644
index 0000000000000..818cba17dc1a6
--- /dev/null
+++ b/llvm/test/Analysis/DemandedBits/div_rem.ll
@@ -0,0 +1,261 @@
+; RUN: opt -S -disable-output -passes="print<demanded-bits>" < %s 2>&1 | FileCheck %s
+
+define i8 @test_sdiv_const_amount_4(i32 %a) {
+; CHECK-LABEL: 'test_sdiv_const_amount_4'
+; CHECK-DAG: DemandedBits: 0xff for   %div.t = trunc i32 %div to i8
+; CHECK-DAG: DemandedBits: 0xff for %div in   %div.t = trunc i32 %div to i8
+; CHECK-DAG: DemandedBits: 0xff for   %div = sdiv i32 %a, 4
+; CHECK-DAG: DemandedBits: 0x3fc for %a in   %div = sdiv i32 %a, 4
+; CHECK-DAG: DemandedBits: 0xffffffff for 4 in   %div = sdiv i32 %a, 4
+;
+  %div = sdiv i32 %a, 4
+  %div.t = trunc i32 %div to i8
+  ret i8 %div.t
+}
+
+define i8 @test_sdiv_const_amount_5(i32 %a) {
+; CHECK-LABEL: 'test_sdiv_const_amount_5'
+; CHECK-DAG: DemandedBits: 0xff for   %div = sdiv i32 %a, 5
+; CHECK-DAG: DemandedBits: 0xfff for %a in   %div = sdiv i32 %a, 5
+; CHECK-DAG: DemandedBits: 0xffffffff for 5 in   %div = sdiv i32 %a, 5
+; CHECK-DAG: DemandedBits: 0xff for   %div.t = trunc i32 %div to i8
+; CHECK-DAG: DemandedBits: 0xff for %div in   %div.t = trunc i32 %div to i8
+;
+  %div = sdiv i32 %a, 5
+  %div.t = trunc i32 %div to i8
+  ret i8 %div.t
+}
+
+define i8 @test_sdiv_const_amount_8(i32 %a) {
+; CHECK-LABEL: 'test_sdiv_const_amount_8'
+; CHECK-DAG: DemandedBits: 0xff for   %div = sdiv i32 %a, 8
+; CHECK-DAG: DemandedBits: 0x7f8 for %a in   %div = sdiv i32 %a, 8
+; CHECK-DAG: DemandedBits: 0xffffffff for 8 in   %div = sdiv i32 %a, 8
+; CHECK-DAG: DemandedBits: 0xff for   %div.t = trunc i32 %div to i8
+; CHECK-DAG: DemandedBits: 0xff for %div in   %div.t = trunc i32 %div to i8
+;
+  %div = sdiv i32 %a, 8
+  %div.t = trunc i32 %div to i8
+  ret i8 %div.t
+}
+
+define i8 @test_sdiv_const_amount_9(i32 %a) {
+; CHECK-LABEL: 'test_sdiv_const_amount_9'
+; CHECK-DAG: DemandedBits: 0xff for   %div = udiv i32 %a, 9
+; CHECK-DAG: DemandedBits: 0xfff for %a in   %div = udiv i32 %a, 9
+; CHECK-DAG: DemandedBits: 0xffffffff for 9 in   %div = udiv i32 %a, 9
+; CHECK-DAG: DemandedBits: 0xff for   %div.t = trunc i32 %div to i8
+; CHECK-DAG: DemandedBits: 0xff for %div in   %div.t = trunc i32 %div to i8
+;
+  %div = udiv i32 %a, 9
+  %div.t = trunc i32 %div to i8
+  ret i8 %div.t
+}
+
+define i8 @test_sdiv(i32 %a, i32 %b) {
+; CHECK-LABEL: 'test_sdiv'
+; CHECK-DAG: DemandedBits: 0xff for   %div = sdiv i32 %a, %b
+; CHECK-DAG: DemandedBits: 0xffffffff for %a in   %div = sdiv i32 %a, %b
+; CHECK-DAG: DemandedBits: 0xffffffff for %b in   %div = sdiv i32 %a, %b
+; CHECK-DAG: DemandedBits: 0xff for   %div.t = trunc i32 %div to i8
+; CHECK-DAG: DemandedBits: 0xff for %div in   %div.t = trunc i32 %div to i8
+;
+  %div = sdiv i32 %a, %b
+  %div.t = trunc i32 %div to i8
+  ret i8 %div.t
+}
+
+define i8 @test_udiv_const_amount_4(i32 %a) {
+; CHECK-LABEL: 'test_udiv_const_amount_4'
+; CHECK-DAG: DemandedBits: 0xff for   %div = udiv i32 %a, 4
+; CHECK-DAG: DemandedBits: 0x3fc for %a in   %div = udiv i32 %a, 4
+; CHECK-DAG: DemandedBits: 0xffffffff for 4 in   %div = udiv i32 %a, 4
+; CHECK-DAG: DemandedBits: 0xff for   %div.t = trunc i32 %div to i8
+; CHECK-DAG: DemandedBits: 0xff for %div in   %div.t = trunc i32 %div to i8
+;
+  %div = udiv i32 %a, 4
+  %div.t = trunc i32 %div to i8
+  ret i8 %div.t
+}
+
+define i8 @test_udiv_const_amount_5(i32 %a) {
+; CHECK-LABEL: 'test_udiv_const_amount_5'
+; CHECK-DAG: DemandedBits: 0xff for   %div.t = trunc i32 %div to i8
+; CHECK-DAG: DemandedBits: 0xff for %div in   %div.t = trunc i32 %div to i8
+; CHECK-DAG: DemandedBits: 0xff for   %div = udiv i32 %a, 5
+; CHECK-DAG: DemandedBits: 0x7ff for %a in   %div = udiv i32 %a, 5
+; CHECK-DAG: DemandedBits: 0xffffffff for 5 in   %div = udiv i32 %a, 5
+;
+  %div = udiv i32 %a, 5
+  %div.t = trunc i32 %div to i8
+  ret i8 %div.t
+}
+
+define i8 @test_udiv_const_amount_8(i32 %a) {
+; CHECK-LABEL: 'test_udiv_const_amount_8'
+; CHECK-DAG: DemandedBits: 0xff for   %div = udiv i32 %a, 8
+; CHECK-DAG: DemandedBits: 0x7f8 for %a in   %div = udiv i32 %a, 8
+; CHECK-DAG: DemandedBits: 0xffffffff for 8 in   %div = udiv i32 %a, 8
+; CHECK-DAG: DemandedBits: 0xff for   %div.t = trunc i32 %div to i8
+; CHECK-DAG: DemandedBits: 0xff for %div in   %div.t = trunc i32 %div to i8
+;
+  %div = udiv i32 %a, 8
+  %div.t = trunc i32 %div to i8
+  ret i8 %div.t
+}
+
+define i8 @test_udiv_const_amount_9(i32 %a) {
+; CHECK-LABEL: 'test_udiv_const_amount_9'
+; CHECK-DAG: DemandedBits: 0xff for   %div.t = trunc i32 %div to i8
+; CHECK-DAG: DemandedBits: 0xff for %div in   %div.t = trunc i32 %div to i8
+; CHECK-DAG: DemandedBits: 0xff for   %div = udiv i32 %a, 9
+; CHECK-DAG: DemandedBits: 0xfff for %a in   %div = udiv i32 %a, 9
+; CHECK-DAG: DemandedBits: 0xffffffff for 9 in   %div = udiv i32 %a, 9
+;
+  %div = udiv i32 %a, 9
+  %div.t = trunc i32 %div to i8
+  ret i8 %div.t
+}
+
+define i8 @test_udiv(i32 %a, i32 %b) {
+; CHECK-LABEL: 'test_udiv'
+; CHECK-DAG: DemandedBits: 0xff for   %div = udiv i32 %a, %b
+; CHECK-DAG: DemandedBits: 0xffffffff for %a in   %div = udiv i32 %a, %b
+; CHECK-DAG: DemandedBits: 0xffffffff for %b in   %div = udiv i32 %a, %b
+; CHECK-DAG: DemandedBits: 0xff for   %div.t = trunc i32 %div to i8
+; CHECK-DAG: DemandedBits: 0xff for %div in   %div.t = trunc i32 %div to i8
+;
+  %div = udiv i32 %a, %b
+  %div.t = trunc i32 %div to i8
+  ret i8 %div.t
+}
+
+define i8 @test_srem_const_amount_4(i32 %a) {
+; CHECK-LABEL: 'test_srem_const_amount_4'
+; CHECK-DAG:  DemandedBits: 0xff for   %rem = srem i32 %a, 4
+; CHECK-DAG:  DemandedBits: 0x3 for %a in   %rem = srem i32 %a, 4
+; CHECK-DAG:  DemandedBits: 0xffffffff for 4 in   %rem = srem i32 %a, 4
+; CHECK-DAG:  DemandedBits: 0xff for   %rem.t = trunc i32 %rem to i8
+; CHECK-DAG:  DemandedBits: 0xff for %rem in   %rem.t = trunc i32 %rem to i8
+;
+  %rem = srem i32 %a, 4
+  %rem.t = trunc i32 %rem to i8
+  ret i8 %rem.t
+}
+
+define i8 @test_srem_const_amount_5(i32 %a) {
+; CHECK-LABEL: 'test_srem_const_amount_5'
+; CHECK-DAG: DemandedBits: 0xff for   %rem.t = trunc i32 %rem to i8
+; CHECK-DAG: DemandedBits: 0xff for %rem in   %rem.t = trunc i32 %rem to i8
+; CHECK-DAG: DemandedBits: 0xff for   %rem = srem i32 %a, 5
+; CHECK-DAG: DemandedBits: 0xfff for %a in   %rem = srem i32 %a, 5
+; CHECK-DAG: DemandedBits: 0xffffffff for 5 in   %rem = srem i32 %a, 5
+;
+  %rem = srem i32 %a, 5
+  %rem.t = trunc i32 %rem to i8
+  ret i8 %rem.t
+}
+
+define i8 @test_srem_const_amount_8(i32 %a) {
+; CHECK-LABEL: 'test_srem_const_amount_8'
+; CHECK-DAG: DemandedBits: 0xff for   %rem = srem i32 %a, 8
+; CHECK-DAG: DemandedBits: 0x7 for %a in   %rem = srem i32 %a, 8
+; CHECK-DAG: DemandedBits: 0xffffffff for 8 in   %rem = srem i32 %a, 8
+; CHECK-DAG: DemandedBits: 0xff for   %rem.t = trunc i32 %rem to i8
+; CHECK-DAG: DemandedBits: 0xff for %rem in   %rem.t = trunc i32 %rem to i8
+;
+  %rem = srem i32 %a, 8
+  %rem.t = trunc i32 %rem to i8
+  ret i8 %rem.t
+}
+
+define i8 @test_srem_const_amount_9(i32 %a) {
+; CHECK-LABEL: 'test_srem_const_amount_9'
+; CHECK-DAG: DemandedBits: 0xff for   %rem.t = trunc i32 %rem to i8
+; CHECK-DAG: DemandedBits: 0xff for %rem in   %rem.t = trunc i32 %rem to i8
+; CHECK-DAG: DemandedBits: 0xff for   %rem = srem i32 %a, 9
+; CHECK-DAG: DemandedBits: 0x1fff for %a in   %rem = srem i32 %a, 9
+; CHECK-DAG: DemandedBits: 0xffffffff for 9 in   %rem = srem i32 %a, 9
+;
+  %rem = srem i32 %a, 9
+  %rem.t = trunc i32 %rem to i8
+  ret i8 %rem.t
+}
+
+define i8 @test_srem(i32 %a, i32 %b) {
+; CHECK-LABEL: 'test_srem'
+; CHECK-DAG: DemandedBits: 0xff for   %rem = srem i32 %a, %b
+; CHECK-DAG: DemandedBits: 0xffffffff for %a in   %rem = srem i32 %a, %b
+; CHECK-DAG: DemandedBits: 0xffffffff for %b in   %rem = srem i32 %a, %b
+; CHECK-DAG: DemandedBits: 0xff for   %rem.t = trunc i32 %rem to i8
+; CHECK-DAG: DemandedBits: 0xff for %rem in   %rem.t = trunc i32 %rem to i8
+;
+  %rem = srem i32 %a, %b
+  %rem.t = trunc i32 %rem to i8
+  ret i8 %rem.t
+}
+
+define i8 @test_urem_const_amount_4(i32 %a) {
+; CHECK-LABEL: 'test_urem_const_amount_4'
+; CHECK-DAG: DemandedBits: 0xff for   %rem.t = trunc i32 %rem to i8
+; CHECK-DAG: DemandedBits: 0xff for %rem in   %rem.t = trunc i32 %rem to i8
+; CHECK-DAG: DemandedBits: 0xff for   %rem = urem i32 %a, 4
+; CHECK-DAG: DemandedBits: 0x3 for %a in   %rem = urem i32 %a, 4
+; CHECK-DAG: DemandedBits: 0xffffffff for 4 in   %rem = urem i32 %a, 4
+;
+  %rem = urem i32 %a, 4
+  %rem.t = trunc i32 %rem to i8
+  ret i8 %rem.t
+}
+
+define i8 @test_urem_const_amount_5(i32 %a) {
+; CHECK-LABEL: 'test_urem_const_amount_5'
+; CHECK-DAG: DemandedBits: 0xff for   %rem.t = trunc i32 %rem to i8
+; CHECK-DAG: DemandedBits: 0xff for %rem in   %rem.t = trunc i32 %rem to i8
+; CHECK-DAG: DemandedBits: 0xff for   %rem = urem i32 %a, 5
+; CHECK-DAG: DemandedBits: 0x7ff for %a in   %rem = urem i32 %a, 5
+; CHECK-DAG: DemandedBits: 0xffffffff for 5 in   %rem = urem i32 %a, 5
+;
+  %rem = urem i32 %a, 5
+  %rem.t = trunc i32 %rem to i8
+  ret i8 %rem.t
+}
+
+define i8 @test_urem_const_amount_8(i32 %a) {
+; CHECK-LABEL: 'test_urem_const_amount_8'
+; CHECK-DAG: DemandedBits: 0xff for   %rem.t = trunc i32 %rem to i8
+; CHECK-DAG: DemandedBits: 0xff for %rem in   %rem.t = trunc i32 %rem to i8
+; CHECK-DAG: DemandedBits: 0xff for   %rem = urem i32 %a, 8
+; CHECK-DAG: DemandedBits: 0x7 for %a in   %rem = urem i32 %a, 8
+; CHECK-DAG: DemandedBits: 0xffffffff for 8 in   %rem = urem i32 %a, 8
+;
+  %rem = urem i32 %a, 8
+  %rem.t = trunc i32 %rem to i8
+  ret i8 %rem.t
+}
+
+define i8 @test_urem_const_amount_9(i32 %a) {
+; CHECK-LABEL: 'test_urem_const_amount_9'
+; CHECK-DAG: DemandedBits: 0xff for   %rem.t = trunc i32 %rem to i8
+; CHECK-DAG: DemandedBits: 0xff for %rem in   %rem.t = trunc i32 %rem to i8
+; CHECK-DAG: DemandedBits: 0xff for   %rem = urem i32 %a, 9
+; CHECK-DAG: DemandedBits: 0xfff for %a in   %rem = urem i32 %a, 9
+; CHECK-DAG: DemandedBits: 0xffffffff for 9 in   %rem = urem i32 %a, 9
+;
+  %rem = urem i32 %a, 9
+  %rem.t = trunc i32 %rem to i8
+  ret i8 %rem.t
+}
+
+define i8 @test_urem(i32 %a, i32 %b) {
+; CHECK-LABEL: 'test_urem'
+; CHECK-DAG: DemandedBits: 0xff for   %rem = urem i32 %a, %b
+; CHECK-DAG: DemandedBits: 0xffffffff for %a in   %rem = urem i32 %a, %b
+; CHECK-DAG: DemandedBits: 0xffffffff for %b in   %rem = urem i32 %a, %b
+; CHECK-DAG: DemandedBits: 0xff for   %rem.t = trunc i32 %rem to i8
+; CHECK-DAG: DemandedBits: 0xff for %rem in   %rem.t = trunc i32 %rem to i8
+;
+  %rem = urem i32 %a, %b
+  %rem.t = trunc i32 %rem to i8
+  ret i8 %rem.t
+}
diff --git a/llvm/test/Analysis/DemandedBits/shl.ll b/llvm/test/Analysis/DemandedBits/shl.ll
index e41f5f4107735..c3313a93c1e85 100644
--- a/llvm/test/Analysis/DemandedBits/shl.ll
+++ b/llvm/test/Analysis/DemandedBits/shl.ll
@@ -57,10 +57,56 @@ define i8 @test_shl(i32 %a, i32 %b) {
 ; CHECK-DAG:  DemandedBits: 0xff for %shl.t = trunc i32 %shl to i8
 ; CHECK-DAG:  DemandedBits: 0xff for %shl in %shl.t = trunc i32 %shl to i8
 ; CHECK-DAG:  DemandedBits: 0xff for %shl = shl i32 %a, %b
-; CHECK-DAG:  DemandedBits: 0xffffffff for %a in %shl = shl i32 %a, %b
+; CHECK-DAG:  DemandedBits: 0xff for %a in %shl = shl i32 %a, %b
 ; CHECK-DAG:  DemandedBits: 0xffffffff for %b in %shl = shl i32 %a, %b
 ;
   %shl = shl i32 %a, %b
   %shl.t = trunc i32 %shl to i8
   ret i8 %shl.t
 }
+
+define i8 @test_shl_var_amount(i32 %a, i32 %b){
+; CHECK-LABEL: 'test_shl_var_amount'
+; CHECK-DAG: DemandedBits: 0xff for   %5 = trunc i32 %4 to i8
+; CHECK-DAG: DemandedBits: 0xff for %4 in   %5 = trunc i32 %4 to i8
+; CHECK-DAG: DemandedBits: 0xff for   %4 = shl i32 %1, %3
+; CHECK-DAG: DemandedBits: 0xff for %1 in   %4 = shl i32 %1, %3
+; CHECK-DAG: DemandedBits: 0xffffffff for %3 in   %4 = shl i32 %1, %3
+; CHECK-DAG: DemandedBits: 0xff for   %2 = trunc i32 %1 to i8
+; CHECK-DAG: DemandedBits: 0xff for %1 in   %2 = trunc i32 %1 to i8
+; CHECK-DAG: DemandedBits: 0xffffffff for   %3 = zext i8 %2 to i32
+; CHECK-DAG: DemandedBits: 0xff for %2 in   %3 = zext i8 %2 to i32
+; CHECK-DAG: DemandedBits: 0xff for   %1 = add nsw i32 %a, %b
+; CHECK-DAG: DemandedBits: 0xff for %a in   %1 = add nsw i32 %a, %b
+; CHECK-DAG: DemandedBits: 0xff for %b in   %1 = add nsw i32 %a, %b
+;
+  %1 = add nsw i32 %a, %b
+  %2 = trunc i32 %1 to i8
+  %3 = zext i8 %2 to i32
+  %4 = shl i32 %1, %3
+  %5 = trunc i32 %4 to i8
+  ret i8 %5
+}
+
+define i8 @test_shl_var_amount_nsw(i32 %a, i32 %b){
+ ; CHECK-LABEL: 'test_shl_var_amount_nsw'
+ ; CHECK-DAG: DemandedBits: 0xff for   %5 = trunc i32 %4 to i8
+ ; CHECK-DAG: DemandedBits: 0xff for %4 in   %5 = trunc i32 %4 to i8
+ ; CHECK-DAG: DemandedBits: 0xff for   %4 = shl nsw i32 %1, %3
+ ; CHECK-DAG: DemandedBits: 0xffffffff for %1 in   %4 = shl nsw i32 %1, %3
+ ; CHECK-DAG: DemandedBits: 0xffffffff for %3 in   %4 = shl nsw i32 %1, %3
+ ; CHECK-DAG: DemandedBits: 0xffffffff for   %3 = zext i8 %2 to i32
+ ; CHECK-DAG: DemandedBits: 0xff for %2 in   %3 = zext i8 %2 to i32
+ ; CHECK-DAG: DemandedBits: 0xff for   %2 = trunc i32 %1 to i8
+ ; CHECK-DAG: DemandedBits: 0xff for %1 in   %2 = trunc i32 %1 to i8
+ ; CHECK-DAG: DemandedBits: 0xffffffff for   %1 = add nsw i32 %a, %b
+ ; CHECK-DAG: DemandedBits: 0xffffffff for %a in   %1 = add nsw i32 %a, %b
+ ; CHECK-DAG: DemandedBits: 0xffffffff for %b in   %1 = add nsw i32 %a, %b
+ ;
+  %1 = add nsw i32 %a, %b
+  %2 = trunc i32 %1 to i8
+  %3 = zext i8 %2 to i32
+  %4 = shl nsw i32 %1, %3
+  %5 = trunc i32 %4 to i8
+  ret i8 %5
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index dab14280a6b71..307659937b0bf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -43,18 +43,18 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; DEFAULT-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP24]]
 ; DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP22]], align 1
 ; DEFAULT-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP25]], align 1
-; DEFAULT-NEXT:    [[TMP26:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
-; DEFAULT-NEXT:    [[TMP27:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
-; DEFAULT-NEXT:    [[TMP28:%.*]] = mul <vscale x 8 x i16> [[TMP26]], [[TMP13]]
-; DEFAULT-NEXT:    [[TMP29:%.*]] = mul <vscale x 8 x i16> [[TMP27]], [[TMP13]]
-; DEFAULT-NEXT:    [[TMP30:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
-; DEFAULT-NEXT:    [[TMP31:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
-; DEFAULT-NEXT:    [[TMP32:%.*]] = or <vscale x 8 x i16> [[TMP28]], [[TMP30]]
-; DEFAULT-NEXT:    [[TMP33:%.*]] = or <vscale x 8 x i16> [[TMP29]], [[TMP31]]
-; DEFAULT-NEXT:    [[TMP34:%.*]] = lshr <vscale x 8 x i16> [[TMP32]], splat (i16 1)
-; DEFAULT-NEXT:    [[TMP35:%.*]] = lshr <vscale x 8 x i16> [[TMP33]], splat (i16 1)
-; DEFAULT-NEXT:    [[TMP36:%.*]] = trunc <vscale x 8 x i16> [[TMP34]] to <vscale x 8 x i8>
-; DEFAULT-NEXT:    [[TMP37:%.*]] = trunc <vscale x 8 x i16> [[TMP35]] to <vscale x 8 x i8>
+; DEFAULT-NEXT:    [[TMP17:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
+; DEFAULT-NEXT:    [[TMP18:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
+; DEFAULT-NEXT:    [[TMP19:%.*]] = mul <vscale x 8 x i16> [[TMP17]], [[TMP13]]
+; DEFAULT-NEXT:    [[TMP27:%.*]] = mul <vscale x 8 x i16> [[TMP18]], [[TMP13]]
+; DEFAULT-NEXT:    [[TMP21:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
+; DEFAULT-NEXT:    [[TMP28:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
+; DEFAULT-NEXT:    [[TMP29:%.*]] = or <vscale x 8 x i16> [[TMP19]], [[TMP21]]
+; DEFAULT-NEXT:    [[TMP30:%.*]] = or <vscale x 8 x i16> [[TMP27]], [[TMP28]]
+; DEFAULT-NEXT:    [[TMP31:%.*]] = lshr <vscale x 8 x i16> [[TMP29]], splat (i16 1)
+; DEFAULT-NEXT:    [[TMP26:%.*]] = lshr <vscale x 8 x i16> [[TMP30]], splat (i16 1)
+; DEFAULT-NEXT:    [[TMP36:%.*]] = trunc <vscale x 8 x i16> [[TMP31]] to <vscale x 8 x i8>
+; DEFAULT-NEXT:    [[TMP37:%.*]] = trunc <vscale x 8 x i16> [[TMP26]] to <vscale x 8 x i8>
 ; DEFAULT-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
 ; DEFAULT-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr [[TMP38]], i32 0
 ; DEFAULT-NEXT:    [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
@@ -128,19 +128,19 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0
 ; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
 ; PRED-NEXT:    [[TMP17:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
-; PRED-NEXT:    [[TMP22:%.*]] = mul <vscale x 16 x i16> [[TMP17]], [[TMP16]]
+; PRED-NEXT:    [[TMP23:%.*]] = mul <vscale x 16 x i16> [[TMP17]], [[TMP16]]
 ; PRED-NEXT:    [[TMP24:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
-; PRED-NEXT:    [[TMP20:%.*]] = or <vscale x 16 x i16> [[TMP22]], [[TMP24]]
+; PRED-NEXT:    [[TMP20:%.*]] = or <vscale x 16 x i16> [[TMP23]], [[TMP24]]
 ; PRED-NEXT:    [[TMP21:%.*]] = lshr <vscale x 16 x i16> [[TMP20]], splat (i16 1)
-; PRED-NEXT:    [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
+; PRED-NEXT:    [[TMP22:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
 ; PRED-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
 ; PRED-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; PRED-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; PRED-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP22]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
 ; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT:    [[TMP28:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; PRED-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 16 x i1> [[TMP28]], i32 0
-; PRED-NEXT:    br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PRED-NEXT:    [[TMP25:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT:    [[TMP28:%.*]] = extractelement <vscale x 16 x i1> [[TMP25]], i32 0
+; PRED-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PRED:       [[MIDDLE_BLOCK]]:
 ; PRED-NEXT:    br label %[[EXIT:.*]]
 ; PRED:       [[SCALAR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll b/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
index 4b8ff86774681..2eac938e9a8e8 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
@@ -77,14 +77,14 @@ define void @trunc_minimal_bitwidths_shufflevector (ptr %p, i32 %arg1, i64 %len)
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[ARG1:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[BROADCAST_SPLAT]] to <vscale x 4 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc <vscale x 4 x i32> [[BROADCAST_SPLAT]] to <vscale x 4 x i8>
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = xor <vscale x 4 x i8> [[WIDE_LOAD]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 4 x i8> [[TMP6]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <vscale x 4 x i8> [[WIDE_LOAD]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 4 x i8> [[TMP12]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    store <vscale x 4 x i8> [[TMP7]], ptr [[TMP5]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]