[llvm] [ValueTracking] Improve `Bitcast` handling to match SDAG (PR #125935)

Thu Feb 6 07:53:40 PST 2025

https://github.com/abhishek-kaushik22 updated https://github.com/llvm/llvm-project/pull/125935

>From 836d9f44d1131b428f54424571d683797ebb927b Mon Sep 17 00:00:00 2001
From: abhishek-kaushik22 <abhishek.kaushik at intel.com>
Date: Thu, 6 Feb 2025 02:55:03 +0530
Subject: [PATCH 1/3] [ValueTracking] Improve `Bitcast` handling to match SDAG

Closes #125228
---
 llvm/lib/Analysis/ValueTracking.cpp | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 6eba6c0f08c3f40..e92d1aa0976e481 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1304,6 +1304,9 @@ static void computeKnownBitsFromOperator(const Operator *I,
     // Look through a cast from narrow vector elements to wider type.
     // Examples: v4i32 -> v2i64, v3i8 -> v24
     unsigned SubBitWidth = SrcVecTy->getScalarSizeInBits();
+    unsigned NumElts = DemandedElts.getBitWidth();
+    unsigned SubScale = BitWidth / SubBitWidth;
+    bool isLE = Q.DL.isLittleEndian();
     if (BitWidth % SubBitWidth == 0) {
       // Known bits are automatically intersected across demanded elements of a
       // vector. So for example, if a bit is computed as known zero, it must be
@@ -1319,8 +1322,6 @@ static void computeKnownBitsFromOperator(const Operator *I,
       //
       // The known bits of each sub-element are then inserted into place
       // (dependent on endian) to form the full result of known bits.
-      unsigned NumElts = DemandedElts.getBitWidth();
-      unsigned SubScale = BitWidth / SubBitWidth;
       APInt SubDemandedElts = APInt::getZero(NumElts * SubScale);
       for (unsigned i = 0; i != NumElts; ++i) {
         if (DemandedElts[i])
@@ -1331,10 +1332,30 @@ static void computeKnownBitsFromOperator(const Operator *I,
       for (unsigned i = 0; i != SubScale; ++i) {
         computeKnownBits(I->getOperand(0), SubDemandedElts.shl(i), KnownSrc,
                          Depth + 1, Q);
-        unsigned ShiftElt = Q.DL.isLittleEndian() ? i : SubScale - 1 - i;
+        unsigned ShiftElt = isLE ? i : SubScale - 1 - i;
         Known.insertBits(KnownSrc, ShiftElt * SubBitWidth);
       }
     }
+
+    if (SubBitWidth % BitWidth == 0) {
+      KnownBits KnownSrc(SubBitWidth);
+      APInt SubDemandedElts =
+          APIntOps::ScaleBitMask(DemandedElts, NumElts / SubScale);
+      computeKnownBits(I->getOperand(0), SubDemandedElts, KnownSrc, Depth + 1,
+                       Q);
+
+      Known.Zero.setAllBits();
+      Known.One.setAllBits();
+      for (unsigned i = 0; i != SubScale; ++i) {
+        if (DemandedElts[i]) {
+          unsigned Shifts = isLE ? i : NumElts - 1 - i;
+          unsigned Offset = (Shifts % SubScale) * BitWidth;
+          Known = Known.intersectWith(KnownSrc.extractBits(BitWidth, Offset));
+          if (Known.isUnknown())
+            break;
+        }
+      }
+    }
     break;
   }
   case Instruction::SExt: {

>From e3a0e9e247565e9e13da52882f6f97325ef30627 Mon Sep 17 00:00:00 2001
From: abhishek-kaushik22 <abhishek.kaushik at intel.com>
Date: Thu, 6 Feb 2025 21:09:20 +0530
Subject: [PATCH 2/3] Minor change and tests added

---
 llvm/lib/Analysis/ValueTracking.cpp           | 11 ++++----
 .../InstCombine/X86/x86-vector-shifts.ll      | 28 +++++++++++++++++++
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index e92d1aa0976e481..1a2ed6d4a75f871 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1301,12 +1301,11 @@ static void computeKnownBitsFromOperator(const Operator *I,
         isa<ScalableVectorType>(I->getType()))
       break;
 
+    unsigned NumElts = DemandedElts.getBitWidth();
+    bool IsLE = Q.DL.isLittleEndian();
     // Look through a cast from narrow vector elements to wider type.
     // Examples: v4i32 -> v2i64, v3i8 -> v24
     unsigned SubBitWidth = SrcVecTy->getScalarSizeInBits();
-    unsigned NumElts = DemandedElts.getBitWidth();
-    unsigned SubScale = BitWidth / SubBitWidth;
-    bool isLE = Q.DL.isLittleEndian();
     if (BitWidth % SubBitWidth == 0) {
       // Known bits are automatically intersected across demanded elements of a
       // vector. So for example, if a bit is computed as known zero, it must be
@@ -1322,6 +1321,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
       //
       // The known bits of each sub-element are then inserted into place
       // (dependent on endian) to form the full result of known bits.
+      unsigned SubScale = BitWidth / SubBitWidth;
       APInt SubDemandedElts = APInt::getZero(NumElts * SubScale);
       for (unsigned i = 0; i != NumElts; ++i) {
         if (DemandedElts[i])
@@ -1332,12 +1332,13 @@ static void computeKnownBitsFromOperator(const Operator *I,
       for (unsigned i = 0; i != SubScale; ++i) {
         computeKnownBits(I->getOperand(0), SubDemandedElts.shl(i), KnownSrc,
                          Depth + 1, Q);
-        unsigned ShiftElt = isLE ? i : SubScale - 1 - i;
+        unsigned ShiftElt = IsLE ? i : SubScale - 1 - i;
         Known.insertBits(KnownSrc, ShiftElt * SubBitWidth);
       }
     }
 
     if (SubBitWidth % BitWidth == 0) {
+      unsigned SubScale = SubBitWidth / BitWidth;
       KnownBits KnownSrc(SubBitWidth);
       APInt SubDemandedElts =
           APIntOps::ScaleBitMask(DemandedElts, NumElts / SubScale);
@@ -1348,7 +1349,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
       Known.One.setAllBits();
       for (unsigned i = 0; i != SubScale; ++i) {
         if (DemandedElts[i]) {
-          unsigned Shifts = isLE ? i : NumElts - 1 - i;
+          unsigned Shifts = IsLE ? i : NumElts - 1 - i;
           unsigned Offset = (Shifts % SubScale) * BitWidth;
           Known = Known.intersectWith(KnownSrc.extractBits(BitWidth, Offset));
           if (Known.isUnknown())
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
index d38e22a4e3851be..9fb4bbc557f3cb3 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
@@ -3732,6 +3732,34 @@ define <4 x i64> @test_avx2_psrl_0() {
   ret <4 x i64> %16
 }
 
+define <2 x i64> @pr125228(<2 x i64> %v, <2 x i64> %s) {
+; CHECK-LABEL: @pr125228(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MASK:%.*]] = and <2 x i64> [[S:%.*]], splat (i64 63)
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i64> [[MASK]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[SLL0:%.*]] = shl <2 x i64> [[V:%.*]], [[TMP0]]
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i64> [[MASK]] to <16 x i8>
+; CHECK-NEXT:    [[PSRLDQ:%.*]] = shufflevector <16 x i8> [[CAST]], <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[CAST3:%.*]] = bitcast <16 x i8> [[PSRLDQ]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[CAST3]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[SLL1:%.*]] = shl <2 x i64> [[V]], [[TMP1]]
+; CHECK-NEXT:    [[SHUFP_UNCASTED:%.*]] = shufflevector <2 x i64> [[SLL0]], <2 x i64> [[SLL1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    ret <2 x i64> [[SHUFP_UNCASTED]]
+;
+entry:
+  %mask = and <2 x i64> %s, splat (i64 63)
+  %sll0 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %mask)
+  %cast = bitcast <2 x i64> %mask to <16 x i8>
+  %psrldq = shufflevector <16 x i8> %cast, <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  %cast3 = bitcast <16 x i8> %psrldq to <2 x i64>
+  %sll1 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %cast3)
+  %cast0 = bitcast <2 x i64> %sll0 to <2 x double>
+  %cast1 = bitcast <2 x i64> %sll1 to <2 x double>
+  %shufp = shufflevector <2 x double> %cast0, <2 x double> %cast1, <2 x i32> <i32 0, i32 3>
+  %res = bitcast <2 x double> %shufp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) #1
 declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) #1
 declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) #1

>From 941e0dd7df90ddf83abc9db43e3a0befc08408c5 Mon Sep 17 00:00:00 2001
From: abhishek-kaushik22 <abhishek.kaushik at intel.com>
Date: Thu, 6 Feb 2025 21:22:09 +0530
Subject: [PATCH 3/3] Add comment

---
 llvm/lib/Analysis/ValueTracking.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 1a2ed6d4a75f871..4d08c3acd108048 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1336,7 +1336,8 @@ static void computeKnownBitsFromOperator(const Operator *I,
         Known.insertBits(KnownSrc, ShiftElt * SubBitWidth);
       }
     }
-
+    // Look through a cast from wider vector elements to narrow type.
+    // Examples: v2i64 -> v4i32
     if (SubBitWidth % BitWidth == 0) {
       unsigned SubScale = SubBitWidth / BitWidth;
       KnownBits KnownSrc(SubBitWidth);