[llvm] [Reland][ValueTracking] Improve Bitcast handling to match SDAG (PR #145223)

Sun Aug 3 03:41:17 PDT 2025

https://github.com/abhishek-kaushik22 updated https://github.com/llvm/llvm-project/pull/145223

>From 30334ac01be4fca5627edeeadfd4a4a8f47ef639 Mon Sep 17 00:00:00 2001
From: Abhishek Kaushik <abhishek.kaushik at intel.com>
Date: Sun, 22 Jun 2025 14:15:03 +0530
Subject: [PATCH 1/5] [Reland][ValueTracking] Improve Bitcast handling to match
 SDAG

---
 llvm/lib/Analysis/ValueTracking.cpp           | 27 +++++++++++++++++--
 .../InstCombine/X86/x86-vector-shifts.ll      |  4 +--
 .../InstCombine/bitcast-known-bits.ll         | 21 +++++----------
 3 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index a17417cb5189c..2c4d55eea1dda 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1346,6 +1346,8 @@ static void computeKnownBitsFromOperator(const Operator *I,
         isa<ScalableVectorType>(I->getType()))
       break;
 
+    unsigned NumElts = DemandedElts.getBitWidth();
+    bool IsLE = Q.DL.isLittleEndian();
     // Look through a cast from narrow vector elements to wider type.
     // Examples: v4i32 -> v2i64, v3i8 -> v24
     unsigned SubBitWidth = SrcVecTy->getScalarSizeInBits();
@@ -1364,7 +1366,6 @@ static void computeKnownBitsFromOperator(const Operator *I,
       //
       // The known bits of each sub-element are then inserted into place
       // (dependent on endian) to form the full result of known bits.
-      unsigned NumElts = DemandedElts.getBitWidth();
       unsigned SubScale = BitWidth / SubBitWidth;
       APInt SubDemandedElts = APInt::getZero(NumElts * SubScale);
       for (unsigned i = 0; i != NumElts; ++i) {
@@ -1376,10 +1377,32 @@ static void computeKnownBitsFromOperator(const Operator *I,
       for (unsigned i = 0; i != SubScale; ++i) {
         computeKnownBits(I->getOperand(0), SubDemandedElts.shl(i), KnownSrc, Q,
                          Depth + 1);
-        unsigned ShiftElt = Q.DL.isLittleEndian() ? i : SubScale - 1 - i;
+        unsigned ShiftElt = IsLE ? i : SubScale - 1 - i;
         Known.insertBits(KnownSrc, ShiftElt * SubBitWidth);
       }
     }
+    // Look through a cast from wider vector elements to narrow type.
+    // Examples: v2i64 -> v4i32
+    if (SubBitWidth % BitWidth == 0) {
+      unsigned SubScale = SubBitWidth / BitWidth;
+      KnownBits KnownSrc(SubBitWidth);
+      APInt SubDemandedElts =
+          APIntOps::ScaleBitMask(DemandedElts, NumElts / SubScale);
+      computeKnownBits(I->getOperand(0), SubDemandedElts, KnownSrc, Q,
+                       Depth + 1);
+
+      Known.Zero.setAllBits();
+      Known.One.setAllBits();
+      for (unsigned i = 0; i != NumElts; ++i) {
+        if (DemandedElts[i]) {
+          unsigned Shifts = IsLE ? i : NumElts - 1 - i;
+          unsigned Offset = (Shifts % SubScale) * BitWidth;
+          Known = Known.intersectWith(KnownSrc.extractBits(BitWidth, Offset));
+          if (Known.isUnknown())
+            break;
+        }
+      }
+    }
     break;
   }
   case Instruction::SExt: {
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
index db56080a3ea2b..cc252ae53803b 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
@@ -3732,7 +3732,6 @@ define <4 x i64> @test_avx2_psrl_0() {
   ret <4 x i64> %16
 }
 
-; FIXME: Failure to peek through bitcasts to ensure psllq shift amount is within bounds.
 define <2 x i64> @PR125228(<2 x i64> %v, <2 x i64> %s) {
 ; CHECK-LABEL: @PR125228(
 ; CHECK-NEXT:    [[MASK:%.*]] = and <2 x i64> [[S:%.*]], splat (i64 63)
@@ -3741,7 +3740,8 @@ define <2 x i64> @PR125228(<2 x i64> %v, <2 x i64> %s) {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i64> [[MASK]] to <16 x i8>
 ; CHECK-NEXT:    [[PSRLDQ:%.*]] = shufflevector <16 x i8> [[CAST]], <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[CAST3:%.*]] = bitcast <16 x i8> [[PSRLDQ]] to <2 x i64>
-; CHECK-NEXT:    [[SLL1:%.*]] = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> [[V]], <2 x i64> [[CAST3]])
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[CAST3]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[SLL1:%.*]] = shl <2 x i64> [[V]], [[TMP2]]
 ; CHECK-NEXT:    [[SHUFP_UNCASTED:%.*]] = shufflevector <2 x i64> [[SLL0]], <2 x i64> [[SLL1]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    ret <2 x i64> [[SHUFP_UNCASTED]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/bitcast-known-bits.ll b/llvm/test/Transforms/InstCombine/bitcast-known-bits.ll
index 3e47e775e3a28..65b43df752f76 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-known-bits.ll
@@ -12,8 +12,7 @@ define <16 x i8> @knownbits_bitcast_masked_shift(<16 x i8> %arg1, <16 x i8> %arg
 ; CHECK-NEXT:    [[BITCAST4:%.*]] = bitcast <16 x i8> [[OR]] to <8 x i16>
 ; CHECK-NEXT:    [[SHL5:%.*]] = shl nuw <8 x i16> [[BITCAST4]], splat (i16 2)
 ; CHECK-NEXT:    [[BITCAST6:%.*]] = bitcast <8 x i16> [[SHL5]] to <16 x i8>
-; CHECK-NEXT:    [[AND7:%.*]] = and <16 x i8> [[BITCAST6]], splat (i8 -52)
-; CHECK-NEXT:    ret <16 x i8> [[AND7]]
+; CHECK-NEXT:    ret <16 x i8> [[BITCAST6]]
 ;
   %and = and <16 x i8> %arg1, splat (i8 3)
   %and3 = and <16 x i8> %arg2, splat (i8 48)
@@ -33,8 +32,7 @@ define <16 x i8> @knownbits_shuffle_masked_nibble_shift(<16 x i8> %arg)  {
 ; CHECK-NEXT:    [[BITCAST1:%.*]] = bitcast <16 x i8> [[SHUFFLEVECTOR]] to <8 x i16>
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw <8 x i16> [[BITCAST1]], splat (i16 4)
 ; CHECK-NEXT:    [[BITCAST2:%.*]] = bitcast <8 x i16> [[SHL]] to <16 x i8>
-; CHECK-NEXT:    [[AND3:%.*]] = and <16 x i8> [[BITCAST2]], splat (i8 -16)
-; CHECK-NEXT:    ret <16 x i8> [[AND3]]
+; CHECK-NEXT:    ret <16 x i8> [[BITCAST2]]
 ;
   %and = and <16 x i8> %arg, splat (i8 15)
   %shufflevector = shufflevector <16 x i8> %and, <16 x i8> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
@@ -53,8 +51,7 @@ define <16 x i8> @knownbits_reverse_shuffle_masked_shift(<16 x i8> %arg)  {
 ; CHECK-NEXT:    [[BITCAST1:%.*]] = bitcast <16 x i8> [[SHUFFLEVECTOR]] to <8 x i16>
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw <8 x i16> [[BITCAST1]], splat (i16 4)
 ; CHECK-NEXT:    [[BITCAST2:%.*]] = bitcast <8 x i16> [[SHL]] to <16 x i8>
-; CHECK-NEXT:    [[AND3:%.*]] = and <16 x i8> [[BITCAST2]], splat (i8 -16)
-; CHECK-NEXT:    ret <16 x i8> [[AND3]]
+; CHECK-NEXT:    ret <16 x i8> [[BITCAST2]]
 ;
   %and = and <16 x i8> %arg, splat (i8 15)
   %shufflevector = shufflevector <16 x i8> %and, <16 x i8> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
@@ -70,8 +67,7 @@ define <16 x i8> @knownbits_extract_bit(<8 x i16> %arg)  {
 ; CHECK-SAME: <8 x i16> [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[LSHR:%.*]] = lshr <8 x i16> [[ARG]], splat (i16 15)
 ; CHECK-NEXT:    [[BITCAST1:%.*]] = bitcast <8 x i16> [[LSHR]] to <16 x i8>
-; CHECK-NEXT:    [[AND:%.*]] = and <16 x i8> [[BITCAST1]], splat (i8 1)
-; CHECK-NEXT:    ret <16 x i8> [[AND]]
+; CHECK-NEXT:    ret <16 x i8> [[BITCAST1]]
 ;
   %lshr = lshr <8 x i16> %arg, splat (i16 15)
   %bitcast1 = bitcast <8 x i16> %lshr to <16 x i8>
@@ -88,7 +84,8 @@ define { i32, i1 } @knownbits_popcount_add_with_overflow(<2 x i64> %arg1, <2 x i
 ; CHECK-NEXT:    [[CALL9:%.*]] = tail call range(i64 0, 65) <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[ARG2]])
 ; CHECK-NEXT:    [[BITCAST10:%.*]] = bitcast <2 x i64> [[CALL9]] to <4 x i32>
 ; CHECK-NEXT:    [[EXTRACTELEMENT11:%.*]] = extractelement <4 x i32> [[BITCAST10]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[EXTRACTELEMENT]], i32 [[EXTRACTELEMENT11]])
+; CHECK-NEXT:    [[CALL12:%.*]] = add nuw nsw i32 [[EXTRACTELEMENT]], [[EXTRACTELEMENT11]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { i32, i1 } { i32 poison, i1 false }, i32 [[CALL12]], 0
 ; CHECK-NEXT:    ret { i32, i1 } [[TMP1]]
 ;
   %call = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %arg1)
@@ -110,11 +107,7 @@ define <16 x i8> @knownbits_shuffle_add_shift_v32i8(<16 x i8> %arg1, <8 x i16> %
 ; CHECK-NEXT:    [[BITCAST11:%.*]] = bitcast <8 x i16> [[SHL10]] to <16 x i8>
 ; CHECK-NEXT:    [[ADD12:%.*]] = add <16 x i8> [[BITCAST11]], [[BITCAST7]]
 ; CHECK-NEXT:    [[ADD14:%.*]] = add <16 x i8> [[ADD12]], [[ARG1]]
-; CHECK-NEXT:    [[BITCAST14:%.*]] = bitcast <16 x i8> [[ADD12]] to <8 x i16>
-; CHECK-NEXT:    [[SHL15:%.*]] = shl <8 x i16> [[BITCAST14]], splat (i16 8)
-; CHECK-NEXT:    [[BITCAST16:%.*]] = bitcast <8 x i16> [[SHL15]] to <16 x i8>
-; CHECK-NEXT:    [[ADD13:%.*]] = add <16 x i8> [[ADD14]], [[BITCAST16]]
-; CHECK-NEXT:    ret <16 x i8> [[ADD13]]
+; CHECK-NEXT:    ret <16 x i8> [[ADD14]]
 ;
   %shl6 = shl <8 x i16> %arg2, splat (i16 8)
   %bitcast7 = bitcast <8 x i16> %shl6 to <16 x i8>

>From b9521a6c90f6d4cc36bc382b9af1b42105648a75 Mon Sep 17 00:00:00 2001
From: Abhishek Kaushik <abhishek.kaushik at intel.com>
Date: Thu, 26 Jun 2025 23:24:40 +0530
Subject: [PATCH 2/5] Add miscompile test

---
 .../Transforms/Inline/bitcast-knownbits.ll    | 45 +++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 llvm/test/Transforms/Inline/bitcast-knownbits.ll

diff --git a/llvm/test/Transforms/Inline/bitcast-knownbits.ll b/llvm/test/Transforms/Inline/bitcast-knownbits.ll
new file mode 100644
index 0000000000000..07c677126c039
--- /dev/null
+++ b/llvm/test/Transforms/Inline/bitcast-knownbits.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=always-inline -S | FileCheck %s
+
+define <2 x i64> @vpx_lpf_horizontal_4_sse2(<2 x i64> %0) {
+; CHECK-LABEL: define <2 x i64> @vpx_lpf_horizontal_4_sse2(
+; CHECK-SAME: <2 x i64> [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = ashr <8 x i16> [[TMP5]], splat (i16 1)
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
+;
+  %2 = call fastcc <2 x i64> @_mm_unpackhi_epi8(<2 x i64> %0)
+  %3 = call fastcc <2 x i64> @_mm_srai_epi16(<2 x i64> %2)
+  ret <2 x i64> %3
+}
+
+define fastcc <2 x i64> @_mm_unpackhi_epi8(<2 x i64> %0) #0 {
+; CHECK-LABEL: define fastcc <2 x i64> @_mm_unpackhi_epi8(
+; CHECK-SAME: <2 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
+;
+  %2 = bitcast <2 x i64> %0 to <16 x i8>
+  %3 = shufflevector <16 x i8> %2, <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  %4 = bitcast <16 x i8> %3 to <2 x i64>
+  ret <2 x i64> %4
+}
+
+define fastcc <2 x i64> @_mm_srai_epi16(<2 x i64> %0) #0 {
+; CHECK-LABEL: define fastcc <2 x i64> @_mm_srai_epi16(
+; CHECK-SAME: <2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <8 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = ashr <8 x i16> [[TMP2]], splat (i16 1)
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
+;
+  %2 = bitcast <2 x i64> %0 to <8 x i16>
+  %3 = ashr <8 x i16> %2, splat (i16 1)
+  ret <2 x i64> zeroinitializer
+}
+
+attributes #0 = { alwaysinline }

>From 6e4ff4f689e8f610fee55dbe2d6a00ebf7017269 Mon Sep 17 00:00:00 2001
From: Abhishek Kaushik <abhishek.kaushik at intel.com>
Date: Tue, 29 Jul 2025 22:04:23 +0530
Subject: [PATCH 3/5] Remove inline test and add instsimplify test

---
 .../Transforms/Inline/bitcast-knownbits.ll    | 45 -------------------
 .../InstSimplify/shift-knownbits.ll           | 32 +++++++++++++
 2 files changed, 32 insertions(+), 45 deletions(-)
 delete mode 100644 llvm/test/Transforms/Inline/bitcast-knownbits.ll

diff --git a/llvm/test/Transforms/Inline/bitcast-knownbits.ll b/llvm/test/Transforms/Inline/bitcast-knownbits.ll
deleted file mode 100644
index 07c677126c039..0000000000000
--- a/llvm/test/Transforms/Inline/bitcast-knownbits.ll
+++ /dev/null
@@ -1,45 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=always-inline -S | FileCheck %s
-
-define <2 x i64> @vpx_lpf_horizontal_4_sse2(<2 x i64> %0) {
-; CHECK-LABEL: define <2 x i64> @vpx_lpf_horizontal_4_sse2(
-; CHECK-SAME: <2 x i64> [[TMP0:%.*]]) {
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = ashr <8 x i16> [[TMP5]], splat (i16 1)
-; CHECK-NEXT:    ret <2 x i64> zeroinitializer
-;
-  %2 = call fastcc <2 x i64> @_mm_unpackhi_epi8(<2 x i64> %0)
-  %3 = call fastcc <2 x i64> @_mm_srai_epi16(<2 x i64> %2)
-  ret <2 x i64> %3
-}
-
-define fastcc <2 x i64> @_mm_unpackhi_epi8(<2 x i64> %0) #0 {
-; CHECK-LABEL: define fastcc <2 x i64> @_mm_unpackhi_epi8(
-; CHECK-SAME: <2 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64>
-; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
-;
-  %2 = bitcast <2 x i64> %0 to <16 x i8>
-  %3 = shufflevector <16 x i8> %2, <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  %4 = bitcast <16 x i8> %3 to <2 x i64>
-  ret <2 x i64> %4
-}
-
-define fastcc <2 x i64> @_mm_srai_epi16(<2 x i64> %0) #0 {
-; CHECK-LABEL: define fastcc <2 x i64> @_mm_srai_epi16(
-; CHECK-SAME: <2 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <8 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = ashr <8 x i16> [[TMP2]], splat (i16 1)
-; CHECK-NEXT:    ret <2 x i64> zeroinitializer
-;
-  %2 = bitcast <2 x i64> %0 to <8 x i16>
-  %3 = ashr <8 x i16> %2, splat (i16 1)
-  ret <2 x i64> zeroinitializer
-}
-
-attributes #0 = { alwaysinline }
diff --git a/llvm/test/Transforms/InstSimplify/shift-knownbits.ll b/llvm/test/Transforms/InstSimplify/shift-knownbits.ll
index 3917172e3b752..c1316041d3d08 100644
--- a/llvm/test/Transforms/InstSimplify/shift-knownbits.ll
+++ b/llvm/test/Transforms/InstSimplify/shift-knownbits.ll
@@ -499,3 +499,35 @@ define <1 x i64> @bitcast_noshift_vector_wrong_type(<2 x float> %v1, <1 x i64> %
   %r = shl <1 x i64> %v2, %b
   ret <1 x i64> %r
 }
+
+; Test that verifies correct handling of known bits when bitcasting from a smaller vector
+; to a larger one (e.g., <2 x i32> to <8 x i8>). Previously, only the subscale portion
+; (e.g., 4 elements) was checked instead of the full demanded vector width (8 elements),
+; leading to incorrect known bits and removal of the `ashr` instruction.
+
+define <8 x i8> @bitcast_knownbits_subscale_miscompile(i32 %x) {
+; CHECK-LABEL: @bitcast_knownbits_subscale_miscompile(
+; CHECK-NEXT:    [[MASKED:%.*]] = and i32 [[X:%.*]], -256
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32 [[MASKED]] to <4 x i8>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x i8> [[BITCAST]], i32 3
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[EXTRACT]], -113
+; CHECK-NEXT:    call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT:    [[INSERT:%.*]] = insertelement <2 x i32> poison, i32 [[MASKED]], i32 0
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <2 x i32> [[INSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC:%.*]] = bitcast <2 x i32> [[SPLAT]] to <8 x i8>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <8 x i8> [[VEC]], <8 x i8> zeroinitializer, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 0, i32 0, i32 0, i32 0>
+; CHECK-NEXT:    [[SHR:%.*]] = ashr <8 x i8> [[SHUF]], splat (i8 1)
+; CHECK-NEXT:    ret <8 x i8> [[SHR]]
+;
+  %masked = and i32 %x, u0xFFFFFF00
+  %bitcast = bitcast i32 %masked to <4 x i8>
+  %extract = extractelement <4 x i8> %bitcast, i32 3
+  %cond = icmp eq i8 %extract, u0x8F
+  call void @llvm.assume(i1 %cond)
+  %insert = insertelement <2 x i32> poison, i32 %masked, i32 0
+  %splat = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> splat (i32 0)
+  %vec = bitcast <2 x i32> %splat to <8 x i8>
+  %shuf = shufflevector <8 x i8> %vec, <8 x i8> zeroinitializer, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 0, i32 0, i32 0, i32 0>
+  %shr = ashr <8 x i8> %shuf, splat (i8 1)
+  ret <8 x i8> %shr
+}

>From 9c02704b06483508ea60f9596337d10f96a44f2a Mon Sep 17 00:00:00 2001
From: Abhishek Kaushik <abhishek.kaushik at intel.com>
Date: Sun, 3 Aug 2025 16:04:48 +0530
Subject: [PATCH 4/5] Update shift-knownbits.ll: set known bits with `or` instead of llvm.assume

---
 .../InstSimplify/shift-knownbits.ll           | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/llvm/test/Transforms/InstSimplify/shift-knownbits.ll b/llvm/test/Transforms/InstSimplify/shift-knownbits.ll
index c1316041d3d08..68b13287e8c96 100644
--- a/llvm/test/Transforms/InstSimplify/shift-knownbits.ll
+++ b/llvm/test/Transforms/InstSimplify/shift-knownbits.ll
@@ -505,14 +505,16 @@ define <1 x i64> @bitcast_noshift_vector_wrong_type(<2 x float> %v1, <1 x i64> %
 ; (e.g., 4 elements) was checked instead of the full demanded vector width (8 elements),
 ; leading to incorrect known bits and removal of the `ashr` instruction.
 
+; Test that verifies correct handling of known bits when bitcasting from a smaller vector
+; to a larger one (e.g., <2 x i32> to <8 x i8>). Previously, only the subscale portion
+; (e.g., 4 elements) was checked instead of the full demanded vector width (8 elements),
+; leading to incorrect known bits and removal of the `ashr` instruction.
+
 define <8 x i8> @bitcast_knownbits_subscale_miscompile(i32 %x) {
 ; CHECK-LABEL: @bitcast_knownbits_subscale_miscompile(
 ; CHECK-NEXT:    [[MASKED:%.*]] = and i32 [[X:%.*]], -256
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32 [[MASKED]] to <4 x i8>
-; CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x i8> [[BITCAST]], i32 3
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[EXTRACT]], -113
-; CHECK-NEXT:    call void @llvm.assume(i1 [[COND]])
-; CHECK-NEXT:    [[INSERT:%.*]] = insertelement <2 x i32> poison, i32 [[MASKED]], i32 0
+; CHECK-NEXT:    [[SETBITS:%.*]] = or i32 [[MASKED]], -16777216
+; CHECK-NEXT:    [[INSERT:%.*]] = insertelement <2 x i32> poison, i32 [[SETBITS]], i32 0
 ; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <2 x i32> [[INSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[VEC:%.*]] = bitcast <2 x i32> [[SPLAT]] to <8 x i8>
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <8 x i8> [[VEC]], <8 x i8> zeroinitializer, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 0, i32 0, i32 0, i32 0>
@@ -520,11 +522,8 @@ define <8 x i8> @bitcast_knownbits_subscale_miscompile(i32 %x) {
 ; CHECK-NEXT:    ret <8 x i8> [[SHR]]
 ;
   %masked = and i32 %x, u0xFFFFFF00
-  %bitcast = bitcast i32 %masked to <4 x i8>
-  %extract = extractelement <4 x i8> %bitcast, i32 3
-  %cond = icmp eq i8 %extract, u0x8F
-  call void @llvm.assume(i1 %cond)
-  %insert = insertelement <2 x i32> poison, i32 %masked, i32 0
+  %setbits = or i32 %masked, u0xFF000000
+  %insert = insertelement <2 x i32> poison, i32 %setbits, i32 0
   %splat = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> splat (i32 0)
   %vec = bitcast <2 x i32> %splat to <8 x i8>
   %shuf = shufflevector <8 x i8> %vec, <8 x i8> zeroinitializer, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 0, i32 0, i32 0, i32 0>

>From ac4766b81f0a48b090fd6b5d6baf3c30347affa8 Mon Sep 17 00:00:00 2001
From: Abhishek Kaushik <abhishek.kaushik at intel.com>
Date: Sun, 3 Aug 2025 16:08:49 +0530
Subject: [PATCH 5/5] Update shift-knownbits.ll: remove duplicated test comment

---
 llvm/test/Transforms/InstSimplify/shift-knownbits.ll | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/llvm/test/Transforms/InstSimplify/shift-knownbits.ll b/llvm/test/Transforms/InstSimplify/shift-knownbits.ll
index 68b13287e8c96..940a41bf6483f 100644
--- a/llvm/test/Transforms/InstSimplify/shift-knownbits.ll
+++ b/llvm/test/Transforms/InstSimplify/shift-knownbits.ll
@@ -505,11 +505,6 @@ define <1 x i64> @bitcast_noshift_vector_wrong_type(<2 x float> %v1, <1 x i64> %
 ; (e.g., 4 elements) was checked instead of the full demanded vector width (8 elements),
 ; leading to incorrect known bits and removal of the `ashr` instruction.
 
-; Test that verifies correct handling of known bits when bitcasting from a smaller vector
-; to a larger one (e.g., <2 x i32> to <8 x i8>). Previously, only the subscale portion
-; (e.g., 4 elements) was checked instead of the full demanded vector width (8 elements),
-; leading to incorrect known bits and removal of the `ashr` instruction.
-
 define <8 x i8> @bitcast_knownbits_subscale_miscompile(i32 %x) {
 ; CHECK-LABEL: @bitcast_knownbits_subscale_miscompile(
 ; CHECK-NEXT:    [[MASKED:%.*]] = and i32 [[X:%.*]], -256