[llvm] [SelectionDAG] Add computeKnownBits support for ISD::STEP_VECTOR (PR #80452)

Fri Feb 2 10:36:29 PST 2024

https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/80452

>From ff14dd1585f2123a2b53851c66c8a268212eadfa Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 2 Feb 2024 22:26:02 +0700
Subject: [PATCH 1/4] Add tests where we should be able to know the lo or hi
 bits are zero

---
 llvm/test/CodeGen/RISCV/rvv/stepvector.ll | 29 +++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
index 6ce307146be19..f3ef9c39b2093 100644
--- a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
@@ -733,3 +733,32 @@ entry:
   %3 = shl <vscale x 16 x i64> %2, %1
   ret <vscale x 16 x i64> %3
 }
+
+; maximum step is 4 * 2 = 8, so maximum step value is 7, so hi 61 bits are known
+; zero
+define <vscale x 2 x i64> @hi_bits_known_zero() vscale_range(2, 4) {
+; CHECK-LABEL: hi_bits_known_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vand.vi v8, v8, -8
+; CHECK-NEXT:    ret
+  %step = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+  %and = and <vscale x 2 x i64> %step, shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 u0xfffffffffffffff8, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+  ret <vscale x 2 x i64> %and
+}
+
+; step values are multiple of 8, so lo 3 bits are known zero
+define <vscale x 2 x i64> @lo_bits_known_zero() {
+; CHECK-LABEL: lo_bits_known_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vsll.vi v8, v8, 3
+; CHECK-NEXT:    vand.vi v8, v8, 7
+; CHECK-NEXT:    ret
+  %step = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+  %step.mul = mul <vscale x 2 x i64> %step, shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 8, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+  %and = and <vscale x 2 x i64> %step.mul, shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 7, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+  ret <vscale x 2 x i64> %and
+}

>From bb72363f6b2287808bc869dda685cef32fe0d5c8 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 2 Feb 2024 22:29:03 +0700
Subject: [PATCH 2/4] [SelectionDAG] Add computeKnownBits support for
 ISD::STEP_VECTOR

This handles two cases where we can work out some known-zero bits for
ISD::STEP_VECTOR.

The first case handles when we know the low bits are zero because the step
amount is a power of two. This is taken from https://reviews.llvm.org/D128159,
and even though the original patch didn't end up landing this case due to it
not having any test difference, I've included it here for completeness's sake.

The second case handles the case when we have an upper bound on vscale_range.
We can use this to work out the upper bound on the number of elements, and thus
what the maximum step will be. From the maximum step we then know which hi bits
are zero.

On its own, computing the known hi bits results in some small improvements for
RVV with -mrvv-vector-bits=zvl across the llvm-test-suite. However I'm hoping
to be able to use this later to reduce the LMUL in index calculations for
vrgather/indexed accesses.

Co-authored-by: Philip Reames <preames at rivosinc.com>
---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 13 +++++++++++++
 llvm/test/CodeGen/RISCV/rvv/stepvector.ll      |  7 ++-----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 3c1343836187a..b20230ff96f4e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3110,6 +3110,19 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
     }
     break;
   }
+  case ISD::STEP_VECTOR: {
+    const APInt &Step = Op.getConstantOperandAPInt(0);
+
+    if (Step.isPowerOf2())
+      Known.Zero.setLowBits(Step.logBase2());
+
+    const Function &F = getMachineFunction().getFunction();
+    const APInt MaxNumElts = getVScaleRange(&F, BitWidth).getUnsignedMax() *
+                             Op.getValueType().getVectorMinNumElements();
+    const APInt MaxValue = (MaxNumElts - 1) * Step;
+    Known.Zero.setHighBits(MaxValue.countl_zero());
+    break;
+  }
   case ISD::BUILD_VECTOR:
     assert(!Op.getValueType().isScalableVector());
     // Collect the known bits that are shared by every demanded vector element.
diff --git a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
index f3ef9c39b2093..d18b45105cff7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
@@ -740,8 +740,7 @@ define <vscale x 2 x i64> @hi_bits_known_zero() vscale_range(2, 4) {
 ; CHECK-LABEL: hi_bits_known_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vand.vi v8, v8, -8
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    ret
   %step = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
   %and = and <vscale x 2 x i64> %step, shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 u0xfffffffffffffff8, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
@@ -753,9 +752,7 @@ define <vscale x 2 x i64> @lo_bits_known_zero() {
 ; CHECK-LABEL: lo_bits_known_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vsll.vi v8, v8, 3
-; CHECK-NEXT:    vand.vi v8, v8, 7
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    ret
   %step = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
   %step.mul = mul <vscale x 2 x i64> %step, shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 8, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)

>From 40012cf08acdb0c827fbcc1a4039bdf67e84f9ce Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Sat, 3 Feb 2024 00:34:23 +0700
Subject: [PATCH 3/4] Check for overflow

---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  6 ++++--
 llvm/test/CodeGen/RISCV/rvv/stepvector.ll      | 16 ++++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b20230ff96f4e..68b9422233902 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3119,8 +3119,10 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
     const Function &F = getMachineFunction().getFunction();
     const APInt MaxNumElts = getVScaleRange(&F, BitWidth).getUnsignedMax() *
                              Op.getValueType().getVectorMinNumElements();
-    const APInt MaxValue = (MaxNumElts - 1) * Step;
-    Known.Zero.setHighBits(MaxValue.countl_zero());
+    bool Overflow;
+    const APInt MaxValue = (MaxNumElts - 1).umul_ov(Step, Overflow);
+    if (!Overflow)
+      Known.Zero.setHighBits(MaxValue.countl_zero());
     break;
   }
   case ISD::BUILD_VECTOR:
diff --git a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
index d18b45105cff7..2d65c9d178b78 100644
--- a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
@@ -747,6 +747,22 @@ define <vscale x 2 x i64> @hi_bits_known_zero() vscale_range(2, 4) {
   ret <vscale x 2 x i64> %and
 }
 
+; the maximum step here overflows so don't set the known hi bits
+define <vscale x 2 x i64> @hi_bits_known_zero_overflow() vscale_range(2, 4) {
+; CHECK-LABEL: hi_bits_known_zero_overflow:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    vmul.vx v8, v8, a0
+; CHECK-NEXT:    vand.vi v8, v8, -8
+; CHECK-NEXT:    ret
+  %step = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+  %step.mul = mul <vscale x 2 x i64> %step, shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 u0xffffffffffffffff, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+  %and = and <vscale x 2 x i64> %step.mul, shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 u0xfffffffffffffff8, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+  ret <vscale x 2 x i64> %and
+}
+
 ; step values are multiple of 8, so lo 3 bits are known zero
 define <vscale x 2 x i64> @lo_bits_known_zero() {
 ; CHECK-LABEL: lo_bits_known_zero:

>From d4721263fc0d5d29974799bdc004876758ca8a1e Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Sat, 3 Feb 2024 01:36:07 +0700
Subject: [PATCH 4/4] Check for overflow in the max num of elements too

---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 68b9422233902..6f56ffcad128e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3117,9 +3117,12 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
       Known.Zero.setLowBits(Step.logBase2());
 
     const Function &F = getMachineFunction().getFunction();
-    const APInt MaxNumElts = getVScaleRange(&F, BitWidth).getUnsignedMax() *
-                             Op.getValueType().getVectorMinNumElements();
     bool Overflow;
+    const APInt MinNumElts =
+        APInt(BitWidth, Op.getValueType().getVectorMinNumElements());
+    const APInt MaxNumElts = getVScaleRange(&F, BitWidth)
+                                 .getUnsignedMax()
+                                 .umul_ov(MinNumElts, Overflow);
     const APInt MaxValue = (MaxNumElts - 1).umul_ov(Step, Overflow);
     if (!Overflow)
       Known.Zero.setHighBits(MaxValue.countl_zero());