[llvm] d62d15e - [RISCV] Undo unprofitable zext of icmp combine (#134306)

via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 4 11:06:03 PDT 2025


Author: Luke Lau
Date: 2025-04-04T19:05:59+01:00
New Revision: d62d15e298ce323cb933f4949b42fe46dcb01b77

URL: https://github.com/llvm/llvm-project/commit/d62d15e298ce323cb933f4949b42fe46dcb01b77
DIFF: https://github.com/llvm/llvm-project/commit/d62d15e298ce323cb933f4949b42fe46dcb01b77.diff

LOG: [RISCV] Undo unprofitable zext of icmp combine (#134306)

InstCombine will combine this zext of an icmp, where the source has a
single bit set, into a lshr plus trunc
(`InstCombinerImpl::transformZExtICmp`):

```llvm
define <vscale x 1 x i8> @f(<vscale x 1 x i64> %x) {
  %1 = and <vscale x 1 x i64> %x, splat (i64 8)
  %2 = icmp ne <vscale x 1 x i64> %1, splat (i64 0)
  %3 = zext <vscale x 1 x i1> %2 to <vscale x 1 x i8>
  ret <vscale x 1 x i8> %3
}
```

into something like:

```llvm
define <vscale x 1 x i8> @reverse_zexticmp_i64(<vscale x 1 x i64> %x) {
  %1 = trunc <vscale x 1 x i64> %x to <vscale x 1 x i8>
  %2 = lshr <vscale x 1 x i8> %1, splat (i8 2)
  %3 = and <vscale x 1 x i8> %2, splat (i8 1)
  ret <vscale x 1 x i8> %3
}
```

In a loop, this ends up being unprofitable for RISC-V because the
codegen now goes from:

```asm
f:                                      # @f
	.cfi_startproc
# %bb.0:
	vsetvli	a0, zero, e64, m1, ta, ma
	vand.vi	v8, v8, 8
	vmsne.vi	v0, v8, 0
	vsetvli	zero, zero, e8, mf8, ta, ma
	vmv.v.i	v8, 0
	vmerge.vim	v8, v8, 1, v0
	ret
```

to a series of narrowing vnsrl.wi instructions:

```asm
f:                                      # @f
	.cfi_startproc
# %bb.0:
	vsetvli	a0, zero, e64, m1, ta, ma
	vand.vi	v8, v8, 8
	vsetvli	zero, zero, e32, mf2, ta, ma
	vnsrl.wi	v8, v8, 3
	vsetvli	zero, zero, e16, mf4, ta, ma
	vnsrl.wi	v8, v8, 0
	vsetvli	zero, zero, e8, mf8, ta, ma
	vnsrl.wi	v8, v8, 0
	ret
```

In the original form, the vmv.v.i is loop invariant and gets hoisted out,
and the vmerge.vim usually gets folded away into a masked instruction, so
in practice you end up with just a vsetvli + vmsne.vi inside the loop.
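
To illustrate that point with a hypothetical example (not taken from the
patch or its tests): if the i1 compare result has a direct merging use such
as a select, the mask can be consumed in place and no zext value needs to be
materialized at all:

```llvm
; Hypothetical: the single-bit test yields a mask that feeds a select
; directly, so a mask-consuming lowering (e.g. a vmerge) can use it without
; building the zext result as a separate vector value.
define <vscale x 1 x i8> @mask_feeds_select(<vscale x 1 x i64> %x, <vscale x 1 x i8> %a, <vscale x 1 x i8> %b) {
  %1 = and <vscale x 1 x i64> %x, splat (i64 8)
  %2 = icmp ne <vscale x 1 x i64> %1, splat (i64 0)
  %3 = select <vscale x 1 x i1> %2, <vscale x 1 x i8> %a, <vscale x 1 x i8> %b
  ret <vscale x 1 x i8> %3
}
```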

The truncating form, by contrast, requires multiple instructions, introduces
a vtype toggle for each of them, and is measurably slower on the BPI-F3.

This reverses the transform in RISCVISelLowering whenever the truncation
narrows by more than a factor of two, i.e. whenever more than one vnsrl.wi
would be needed; truncations that a single vnsrl.wi can handle (e.g. i16 ->
i8, as in the reverse_zexticmp_i16 test below) are left alone.
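
As a sketch of what the reversed pattern looks like, expressed at the IR
level (the combine itself rewrites the equivalent SelectionDAG nodes, as in
the patch below): for the i32 test case with a shift amount of 2, the
single-bit test becomes an and with 1 << 2 = 4 followed by an icmp ne
against zero:

```llvm
; IR-level sketch of the reversed pattern for the i32 test case below;
; the actual combine operates on the corresponding SelectionDAG nodes.
define <vscale x 1 x i8> @reversed_form_sketch(<vscale x 1 x i32> %x) {
  %1 = and <vscale x 1 x i32> %x, splat (i32 4)      ; X & (1 << ShAmt)
  %2 = icmp ne <vscale x 1 x i32> %1, splat (i32 0)  ; single-bit test
  %3 = zext <vscale x 1 x i1> %2 to <vscale x 1 x i8>
  ret <vscale x 1 x i8> %3
}
```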

Fixes #132245

Added: 
    llvm/test/CodeGen/RISCV/rvv/zext-icmp.ll

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 8c409adedc2df..ad44ee755698a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -15040,6 +15040,70 @@ static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
   return combineTruncSelectToSMaxUSat(N, DAG);
 }
 
+// InstCombinerImpl::transformZExtICmp will narrow a zext of an icmp with a
+// truncation. But RVV doesn't have truncation instructions for more than twice
+// the bitwidth.
+//
+// E.g. trunc <vscale x 1 x i64> %x to <vscale x 1 x i8> will generate:
+//
+//     vsetvli a0, zero, e32, m2, ta, ma
+//     vnsrl.wi v12, v8, 0
+//     vsetvli zero, zero, e16, m1, ta, ma
+//     vnsrl.wi v8, v12, 0
+//     vsetvli zero, zero, e8, mf2, ta, ma
+//     vnsrl.wi v8, v8, 0
+//
+// So reverse the combine so we generate a vmseq/vmsne again:
+//
+// and (lshr (trunc X), ShAmt), 1
+// -->
+// zext (icmp ne (and X, (1 << ShAmt)), 0)
+//
+// and (lshr (not (trunc X)), ShAmt), 1
+// -->
+// zext (icmp eq (and X, (1 << ShAmt)), 0)
+static SDValue reverseZExtICmpCombine(SDNode *N, SelectionDAG &DAG,
+                                      const RISCVSubtarget &Subtarget) {
+  using namespace SDPatternMatch;
+  SDLoc DL(N);
+
+  if (!Subtarget.hasVInstructions())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+
+  APInt ShAmt;
+  SDValue Inner;
+  if (!sd_match(N, m_And(m_OneUse(m_Srl(m_Value(Inner), m_ConstInt(ShAmt))),
+                         m_One())))
+    return SDValue();
+
+  SDValue X;
+  bool IsNot;
+  if (sd_match(Inner, m_Not(m_Trunc(m_Value(X)))))
+    IsNot = true;
+  else if (sd_match(Inner, m_Trunc(m_Value(X))))
+    IsNot = false;
+  else
+    return SDValue();
+
+  EVT WideVT = X.getValueType();
+  if (VT.getScalarSizeInBits() >= WideVT.getScalarSizeInBits() / 2)
+    return SDValue();
+
+  SDValue Res =
+      DAG.getNode(ISD::AND, DL, WideVT, X,
+                  DAG.getConstant(1 << ShAmt.getZExtValue(), DL, WideVT));
+  Res = DAG.getSetCC(DL,
+                     EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                      WideVT.getVectorElementCount()),
+                     Res, DAG.getConstant(0, DL, WideVT),
+                     IsNot ? ISD::SETEQ : ISD::SETNE);
+  return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Res);
+}
+
 // Combines two comparison operation and logic operation to one selection
 // operation(min, max) and logic operation. Returns new constructed Node if
 // conditions for optimization are satisfied.
@@ -15067,6 +15131,9 @@ static SDValue performANDCombine(SDNode *N,
     return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, And);
   }
 
+  if (SDValue V = reverseZExtICmpCombine(N, DAG, Subtarget))
+    return V;
+
   if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
     return V;
   if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))

diff --git a/llvm/test/CodeGen/RISCV/rvv/zext-icmp.ll b/llvm/test/CodeGen/RISCV/rvv/zext-icmp.ll
new file mode 100644
index 0000000000000..e5043281a27dd
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/zext-icmp.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s
+
+; Test that we reverse InstCombinerImpl::transformZExtICmp when unprofitable
+
+define <vscale x 1 x i8> @reverse_zexticmp_i16(<vscale x 1 x i16> %x) {
+; CHECK-LABEL: reverse_zexticmp_i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vsrl.vi v8, v8, 2
+; CHECK-NEXT:    vand.vi v8, v8, 1
+; CHECK-NEXT:    ret
+  %1 = trunc <vscale x 1 x i16> %x to <vscale x 1 x i8>
+  %2 = lshr <vscale x 1 x i8> %1, splat (i8 2)
+  %3 = and <vscale x 1 x i8> %2, splat (i8 1)
+  ret <vscale x 1 x i8> %3
+}
+
+define <vscale x 1 x i8> @reverse_zexticmp_i32(<vscale x 1 x i32> %x) {
+; CHECK-LABEL: reverse_zexticmp_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vand.vi v8, v8, 4
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    ret
+  %1 = trunc <vscale x 1 x i32> %x to <vscale x 1 x i8>
+  %2 = lshr <vscale x 1 x i8> %1, splat (i8 2)
+  %3 = and <vscale x 1 x i8> %2, splat (i8 1)
+  ret <vscale x 1 x i8> %3
+}
+
+define <vscale x 1 x i8> @reverse_zexticmp_neg_i32(<vscale x 1 x i32> %x) {
+; CHECK-LABEL: reverse_zexticmp_neg_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vand.vi v8, v8, 4
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    ret
+  %1 = trunc <vscale x 1 x i32> %x to <vscale x 1 x i8>
+  %2 = xor <vscale x 1 x i8> %1, splat (i8 -1)
+  %3 = lshr <vscale x 1 x i8> %2, splat (i8 2)
+  %4 = and <vscale x 1 x i8> %3, splat (i8 1)
+  ret <vscale x 1 x i8> %4
+}
+
+define <vscale x 1 x i8> @reverse_zexticmp_i64(<vscale x 1 x i64> %x) {
+; CHECK-LABEL: reverse_zexticmp_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vand.vi v8, v8, 4
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    ret
+  %1 = trunc <vscale x 1 x i64> %x to <vscale x 1 x i8>
+  %2 = lshr <vscale x 1 x i8> %1, splat (i8 2)
+  %3 = and <vscale x 1 x i8> %2, splat (i8 1)
+  ret <vscale x 1 x i8> %3
+}
+
+define <vscale x 1 x i8> @reverse_zexticmp_neg_i64(<vscale x 1 x i64> %x) {
+; CHECK-LABEL: reverse_zexticmp_neg_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vand.vi v8, v8, 4
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    ret
+  %1 = trunc <vscale x 1 x i64> %x to <vscale x 1 x i8>
+  %2 = xor <vscale x 1 x i8> %1, splat (i8 -1)
+  %3 = lshr <vscale x 1 x i8> %2, splat (i8 2)
+  %4 = and <vscale x 1 x i8> %3, splat (i8 1)
+  ret <vscale x 1 x i8> %4
+}
+
