[llvm] [SDAG] Construct constants via instructions if materialization is costly (PR #86659)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 26 05:58:05 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-selectiondag
Author: Wang Pengcheng (wangpc-pp)
<details>
<summary>Changes</summary>
For some targets, such as RISC-V, it is costly to materialize the constants
used when lowering `ISD::CTPOP`/`ISD::VP_CTPOP`.
We can query the materialization cost via `TargetTransformInfo::getIntImmCost`,
and if that cost is larger than 2, we instead construct the constant
from a previously materialized one via two instructions (a shift plus an XOR/AND).
This fixes #86207.
---
Patch is 763.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/86659.diff
12 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+67-12)
- (modified) llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll (+144-144)
- (modified) llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll (+76-80)
- (modified) llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll (+1167-779)
- (modified) llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll (+184-192)
- (modified) llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll (+1246-888)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll (+2468-1788)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll (+52-88)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll (+1177-812)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll (+30-48)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll (+2548-1916)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll (+76-112)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 8be03b66e155f6..566a76a09d783f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -12,6 +12,7 @@
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/CodeGenCommonISel.h"
@@ -8666,14 +8667,32 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
if (VT.isVector() && !canExpandVectorCTPOP(*this, VT))
return SDValue();
+ const auto &TLI = DAG.getTargetLoweringInfo();
+ const auto &TTI = TLI.getTargetMachine().getTargetTransformInfo(
+ DAG.getMachineFunction().getFunction());
+ Type *VTTy = VT.getScalarType().getTypeForEVT(*DAG.getContext());
+
// This is the "best" algorithm from
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
- SDValue Mask55 =
- DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl, VT);
+ // 0x0F0F0F0F...
+ const APInt &Constant0F = APInt::getSplat(Len, APInt(8, 0x0F));
+ SDValue Mask0F = DAG.getConstant(Constant0F, dl, VT);
+ // 0x33333333... = (0x0F0F0F0F... ^ (0x0F0F0F0F... << 2))
+ const APInt &Constant33 = APInt::getSplat(Len, APInt(8, 0x33));
SDValue Mask33 =
- DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), dl, VT);
- SDValue Mask0F =
- DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), dl, VT);
+ TTI.getIntImmCost(Constant33, VTTy, TargetTransformInfo::TCK_Latency) > 2
+ ? DAG.getNode(ISD::XOR, dl, VT, Mask0F,
+ DAG.getNode(ISD::SHL, dl, VT, Mask0F,
+ DAG.getShiftAmountConstant(2, VT, dl)))
+ : DAG.getConstant(Constant33, dl, VT);
+ // 0x55555555... = (0x33333333... ^ (0x33333333... << 1))
+ const APInt &Constant55 = APInt::getSplat(Len, APInt(8, 0x55));
+ SDValue Mask55 =
+ TTI.getIntImmCost(Constant55, VTTy, TargetTransformInfo::TCK_Latency) > 2
+ ? DAG.getNode(ISD::XOR, dl, VT, Mask33,
+ DAG.getNode(ISD::SHL, dl, VT, Mask33,
+ DAG.getShiftAmountConstant(1, VT, dl)))
+ : DAG.getConstant(Constant55, dl, VT);
// v = v - ((v >> 1) & 0x55555555...)
Op = DAG.getNode(ISD::SUB, dl, VT, Op,
@@ -8710,8 +8729,14 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
}
// v = (v * 0x01010101...) >> (Len - 8)
+ // 0x01010101... == (0x0F0F0F0F... & (0x0F0F0F0F... >> 3))
+ const APInt &Constant01 = APInt::getSplat(Len, APInt(8, 0x01));
SDValue Mask01 =
- DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);
+ TTI.getIntImmCost(Constant01, VTTy, TargetTransformInfo::TCK_Latency) > 2
+ ? DAG.getNode(ISD::AND, dl, VT, Mask0F,
+ DAG.getNode(ISD::SRL, dl, VT, Mask0F,
+ DAG.getShiftAmountConstant(3, VT, dl)))
+ : DAG.getConstant(Constant01, dl, VT);
return DAG.getNode(ISD::SRL, dl, VT,
DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
DAG.getConstant(Len - 8, dl, ShVT));
@@ -8731,14 +8756,36 @@ SDValue TargetLowering::expandVPCTPOP(SDNode *Node, SelectionDAG &DAG) const {
if (!(Len <= 128 && Len % 8 == 0))
return SDValue();
+ const auto &TLI = DAG.getTargetLoweringInfo();
+ const auto &TTI = TLI.getTargetMachine().getTargetTransformInfo(
+ DAG.getMachineFunction().getFunction());
+ Type *VTTy = VT.getScalarType().getTypeForEVT(*DAG.getContext());
+
// This is same algorithm of expandCTPOP from
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
- SDValue Mask55 =
- DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl, VT);
+ // 0x0F0F0F0F...
+ const APInt &Constant0F = APInt::getSplat(Len, APInt(8, 0x0F));
+ SDValue Mask0F = DAG.getConstant(Constant0F, dl, VT);
+ // 0x33333333... = (0x0F0F0F0F... ^ (0x0F0F0F0F... << 2))
+ const APInt &Constant33 = APInt::getSplat(Len, APInt(8, 0x33));
SDValue Mask33 =
- DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), dl, VT);
- SDValue Mask0F =
- DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), dl, VT);
+ TTI.getIntImmCost(Constant33, VTTy, TargetTransformInfo::TCK_Latency) > 2
+ ? DAG.getNode(ISD::VP_XOR, dl, VT, Mask0F,
+ DAG.getNode(ISD::VP_SHL, dl, VT, Mask0F,
+ DAG.getShiftAmountConstant(2, VT, dl), Mask,
+ VL),
+ Mask, VL)
+ : DAG.getConstant(Constant33, dl, VT);
+ // 0x55555555... = (0x33333333... ^ (0x33333333... << 1))
+ const APInt &Constant55 = APInt::getSplat(Len, APInt(8, 0x55));
+ SDValue Mask55 =
+ TTI.getIntImmCost(Constant55, VTTy, TargetTransformInfo::TCK_Latency) > 2
+ ? DAG.getNode(ISD::VP_XOR, dl, VT, Mask33,
+ DAG.getNode(ISD::VP_SHL, dl, VT, Mask33,
+ DAG.getShiftAmountConstant(1, VT, dl), Mask,
+ VL),
+ Mask, VL)
+ : DAG.getConstant(Constant55, dl, VT);
SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5;
@@ -8767,8 +8814,16 @@ SDValue TargetLowering::expandVPCTPOP(SDNode *Node, SelectionDAG &DAG) const {
return Op;
// v = (v * 0x01010101...) >> (Len - 8)
+ // 0x01010101... == (0x0F0F0F0F... & (0x0F0F0F0F... >> 3))
+ const APInt &Constant01 = APInt::getSplat(Len, APInt(8, 0x01));
SDValue Mask01 =
- DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);
+ TTI.getIntImmCost(Constant01, VTTy, TargetTransformInfo::TCK_Latency) > 2
+ ? DAG.getNode(ISD::VP_AND, dl, VT, Mask0F,
+ DAG.getNode(ISD::VP_LSHR, dl, VT, Mask0F,
+ DAG.getShiftAmountConstant(3, VT, dl), Mask,
+ VL),
+ Mask, VL)
+ : DAG.getConstant(Constant01, dl, VT);
return DAG.getNode(ISD::VP_LSHR, dl, VT,
DAG.getNode(ISD::VP_MUL, dl, VT, Op, Mask01, Mask, VL),
DAG.getConstant(Len - 8, dl, ShVT), Mask, VL);
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
index fc94f8c2a52797..e6f033937ec286 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
@@ -1147,30 +1147,24 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
; RV32I-NEXT: vor.vv v8, v8, v9
; RV32I-NEXT: vnot.v v8, v8
; RV32I-NEXT: vsrl.vi v9, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; RV32I-NEXT: vmv.v.x v10, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v9, v9, v10
+; RV32I-NEXT: vsll.vi v11, v10, 2
+; RV32I-NEXT: vxor.vv v11, v10, v11
+; RV32I-NEXT: vadd.vv v12, v11, v11
+; RV32I-NEXT: vxor.vv v12, v11, v12
+; RV32I-NEXT: vand.vv v9, v9, v12
; RV32I-NEXT: vsub.vv v8, v8, v9
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v10, v8, v9
+; RV32I-NEXT: vand.vv v9, v8, v11
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vand.vv v8, v8, v11
+; RV32I-NEXT: vadd.vv v8, v9, v8
; RV32I-NEXT: vsrl.vi v9, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v9
-; RV32I-NEXT: lui a0, 61681
-; RV32I-NEXT: addi a0, a0, -241
-; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vand.vv v8, v8, v10
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
@@ -1288,30 +1282,24 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
; RV32I-NEXT: vor.vv v8, v8, v10
; RV32I-NEXT: vnot.v v8, v8
; RV32I-NEXT: vsrl.vi v10, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; RV32I-NEXT: vmv.v.x v12, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v10, v10, v12
+; RV32I-NEXT: vsll.vi v14, v12, 2
+; RV32I-NEXT: vxor.vv v14, v12, v14
+; RV32I-NEXT: vadd.vv v16, v14, v14
+; RV32I-NEXT: vxor.vv v16, v14, v16
+; RV32I-NEXT: vand.vv v10, v10, v16
; RV32I-NEXT: vsub.vv v8, v8, v10
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v12, v8, v10
+; RV32I-NEXT: vand.vv v10, v8, v14
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vand.vv v8, v8, v14
+; RV32I-NEXT: vadd.vv v8, v10, v8
; RV32I-NEXT: vsrl.vi v10, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v10
-; RV32I-NEXT: lui a0, 61681
-; RV32I-NEXT: addi a0, a0, -241
-; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vand.vv v8, v8, v12
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
@@ -1429,30 +1417,24 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
; RV32I-NEXT: vor.vv v8, v8, v12
; RV32I-NEXT: vnot.v v8, v8
; RV32I-NEXT: vsrl.vi v12, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; RV32I-NEXT: vmv.v.x v16, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT: vand.vv v12, v12, v16
+; RV32I-NEXT: vsll.vi v20, v16, 2
+; RV32I-NEXT: vxor.vv v20, v16, v20
+; RV32I-NEXT: vadd.vv v24, v20, v20
+; RV32I-NEXT: vxor.vv v24, v20, v24
+; RV32I-NEXT: vand.vv v12, v12, v24
; RV32I-NEXT: vsub.vv v8, v8, v12
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT: vand.vv v16, v8, v12
+; RV32I-NEXT: vand.vv v12, v8, v20
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v12
-; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: vand.vv v8, v8, v20
+; RV32I-NEXT: vadd.vv v8, v12, v8
; RV32I-NEXT: vsrl.vi v12, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v12
-; RV32I-NEXT: lui a0, 61681
-; RV32I-NEXT: addi a0, a0, -241
-; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vand.vv v8, v8, v16
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
@@ -1554,6 +1536,12 @@ declare <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64>, i1)
define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
; RV32I-LABEL: ctlz_nxv8i64:
; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: sub sp, sp, a0
+; RV32I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; RV32I-NEXT: vsrl.vi v16, v8, 1
; RV32I-NEXT: vor.vv v8, v8, v16
@@ -1569,31 +1557,39 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
; RV32I-NEXT: vsrl.vx v16, v8, a0
; RV32I-NEXT: vor.vv v8, v8, v16
; RV32I-NEXT: vnot.v v8, v8
-; RV32I-NEXT: vsrl.vi v16, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
-; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v24, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT: vand.vv v16, v16, v24
-; RV32I-NEXT: vsub.vv v8, v8, v16
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v16, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT: vand.vv v24, v8, v16
-; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v16
-; RV32I-NEXT: vadd.vv v8, v24, v8
-; RV32I-NEXT: vsrl.vi v16, v8, 4
-; RV32I-NEXT: vadd.vv v8, v8, v16
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32I-NEXT: vsrl.vi v8, v8, 1
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 3
+; RV32I-NEXT: add a0, sp, a0
+; RV32I-NEXT: addi a0, a0, 16
+; RV32I-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32I-NEXT: lui a0, 61681
; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT: vmv.v.x v16, a0
+; RV32I-NEXT: vmv.v.x v8, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vsll.vi v0, v8, 2
+; RV32I-NEXT: vxor.vv v0, v8, v0
+; RV32I-NEXT: vadd.vv v24, v0, v0
+; RV32I-NEXT: vxor.vv v24, v0, v24
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 3
+; RV32I-NEXT: add a0, sp, a0
+; RV32I-NEXT: addi a0, a0, 16
+; RV32I-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT: vand.vv v24, v16, v24
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32I-NEXT: vsub.vv v16, v16, v24
+; RV32I-NEXT: vand.vv v24, v16, v0
+; RV32I-NEXT: vsrl.vi v16, v16, 2
+; RV32I-NEXT: vand.vv v16, v16, v0
+; RV32I-NEXT: vadd.vv v16, v24, v16
+; RV32I-NEXT: vsrl.vi v24, v16, 4
+; RV32I-NEXT: vadd.vv v16, v16, v24
+; RV32I-NEXT: vand.vv v8, v16, v8
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
@@ -1602,6 +1598,10 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
; RV32I-NEXT: vmul.vv v8, v8, v16
; RV32I-NEXT: li a0, 56
; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: csrr a0, vlenb
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: add sp, sp, a0
+; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
; RV64I-LABEL: ctlz_nxv8i64:
@@ -2753,30 +2753,24 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
; RV32I-NEXT: vor.vv v8, v8, v9
; RV32I-NEXT: vnot.v v8, v8
; RV32I-NEXT: vsrl.vi v9, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; RV32I-NEXT: vmv.v.x v10, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v9, v9, v10
+; RV32I-NEXT: vsll.vi v11, v10, 2
+; RV32I-NEXT: vxor.vv v11, v10, v11
+; RV32I-NEXT: vadd.vv v12, v11, v11
+; RV32I-NEXT: vxor.vv v12, v11, v12
+; RV32I-NEXT: vand.vv v9, v9, v12
; RV32I-NEXT: vsub.vv v8, v8, v9
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v10, v8, v9
+; RV32I-NEXT: vand.vv v9, v8, v11
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v9
-; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vand.vv v8, v8, v11
+; RV32I-NEXT: vadd.vv v8, v9, v8
; RV32I-NEXT: vsrl.vi v9, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v9
-; RV32I-NEXT: lui a0, 61681
-; RV32I-NEXT: addi a0, a0, -241
-; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT: vmv.v.x v9, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vand.vv v8, v8, v10
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma
@@ -2889,30 +2883,24 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
; RV32I-NEXT: vor.vv v8, v8, v10
; RV32I-NEXT: vnot.v v8, v8
; RV32I-NEXT: vsrl.vi v10, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; RV32I-NEXT: vmv.v.x v12, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v10, v10, v12
+; RV32I-NEXT: vsll.vi v14, v12, 2
+; RV32I-NEXT: vxor.vv v14, v12, v14
+; RV32I-NEXT: vadd.vv v16, v14, v14
+; RV32I-NEXT: vxor.vv v16, v14, v16
+; RV32I-NEXT: vand.vv v10, v10, v16
; RV32I-NEXT: vsub.vv v8, v8, v10
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v12, v8, v10
+; RV32I-NEXT: vand.vv v10, v8, v14
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v10
-; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vand.vv v8, v8, v14
+; RV32I-NEXT: vadd.vv v8, v10, v8
; RV32I-NEXT: vsrl.vi v10, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v10
-; RV32I-NEXT: lui a0, 61681
-; RV32I-NEXT: addi a0, a0, -241
-; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT: vmv.v.x v10, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vand.vv v8, v8, v12
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma
@@ -3025,30 +3013,24 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
; RV32I-NEXT: vor.vv v8, v8, v12
; RV32I-NEXT: vnot.v v8, v8
; RV32I-NEXT: vsrl.vi v12, v8, 1
-; RV32I-NEXT: lui a0, 349525
-; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; RV32I-NEXT: vmv.v.x v16, a0
; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT: vand.vv v12, v12, v16
+; RV32I-NEXT: vsll.vi v20, v16, 2
+; RV32I-NEXT: vxor.vv v20, v16, v20
+; RV32I-NEXT: vadd.vv v24, v20, v20
+; RV32I-NEXT: vxor.vv v24, v20, v24
+; RV32I-NEXT: vand.vv v12, v12, v24
; RV32I-NEXT: vsub.vv v8, v8, v12
-; RV32I-NEXT: lui a0, 209715
-; RV32I-NEXT: addi a0, a0, 819
-; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT: vand.vv v16, v8, v12
+; RV32I-NEXT: vand.vv v12, v8, v20
; RV32I-NEXT: vsrl.vi v8, v8, 2
-; RV32I-NEXT: vand.vv v8, v8, v12
-; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: vand.vv v8, v8, v20
+; RV32I-NEXT: vadd.vv v8, v12, v8
; RV32I-NEXT: vsrl.vi v12, v8, 4
; RV32I-NEXT: vadd.vv v8, v8, v12
-; RV32I-NEXT: lui a0, 61681
-; RV32I-NEXT: addi a0, a0, -241
-; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT: vmv.v.x v12, a0
-; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vand.vv v8, v8, v16
; RV32I-NEXT: lui a0, 4112
; RV32I-NEXT: addi a0, a0, 257
; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma
@@ -3145,6 +3127,12 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
define <vscale x 8 x i64> @ctlz_zero_und...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/86659
More information about the llvm-commits
mailing list