[llvm] e9fa6ff - [RISCV] Fold (vXi8 (trunc (vselect (setltu, X, 256), X, (sext (setgt X, 0))))) to vmax+vnclipu. (#94720)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 7 09:57:08 PDT 2024
Author: Craig Topper
Date: 2024-06-07T09:57:03-07:00
New Revision: e9fa6ffaf7e86fe9f91fbcaabce5436311ac814c
URL: https://github.com/llvm/llvm-project/commit/e9fa6ffaf7e86fe9f91fbcaabce5436311ac814c
DIFF: https://github.com/llvm/llvm-project/commit/e9fa6ffaf7e86fe9f91fbcaabce5436311ac814c.diff
LOG: [RISCV] Fold (vXi8 (trunc (vselect (setltu, X, 256), X, (sext (setgt X, 0))))) to vmax+vnclipu. (#94720)
This pattern is an obscured way to express saturating a signed value
into a smaller unsigned value.
If (setltu, X, 256) is true, then the value is already in the desired
range, so we can pick X. If it's false, we select (sext (setgt X, 0)),
which is 0 for negative values and all ones for positive values. The
all-ones value, when truncated to the final type, is still all ones,
which is what we want.
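As a scalar model of the equivalence, here is a minimal sketch (the
helper names are hypothetical, not code from this patch); both forms
agree for every i16 input:

#include <algorithm>
#include <cassert>
#include <cstdint>

// The pattern being matched, element-wise, for an i16 -> i8 truncate.
uint8_t selectPattern(int16_t x) {
  int16_t sextCmp = (x > 0) ? int16_t(-1) : int16_t(0); // sext (setgt X, 0)
  int16_t sel = (uint16_t(x) < 256) ? x : sextCmp;      // vselect (setltu X, 256)
  return uint8_t(sel);                                  // trunc to i8
}

// The smax/smin form the combine emits instead.
uint8_t smaxUSat(int16_t x) {
  return uint8_t(std::min<int16_t>(std::max<int16_t>(x, 0), 255));
}

int main() {
  for (int x = INT16_MIN; x <= INT16_MAX; ++x)
    assert(selectPattern(int16_t(x)) == smaxUSat(int16_t(x)));
}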
Added:
llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll
Modified:
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 1eae1dc968db0..af3950773e4d0 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1480,7 +1480,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.hasStdExtZbb())
setTargetDAGCombine({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN});
- if (Subtarget.hasStdExtZbs() && Subtarget.is64Bit())
+ if ((Subtarget.hasStdExtZbs() && Subtarget.is64Bit()) ||
+ Subtarget.hasStdExtV())
setTargetDAGCombine(ISD::TRUNCATE);
if (Subtarget.hasStdExtZbkb())
@@ -13404,6 +13405,76 @@ static SDValue combineDeMorganOfBoolean(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::XOR, DL, VT, Logic, DAG.getConstant(1, DL, VT));
}
+// Fold (vXi8 (trunc (vselect (setltu, X, 256), X, (sext (setgt X, 0))))) to
+// (vXi8 (trunc (smin (smax X, 0), 255))). This represents saturating a signed
+// value to an unsigned value. This will be lowered to a vmax and a series of
+// vnclipu instructions later. This can be extended to truncated types other
+// than i8 by replacing 256 and 255 with the equivalent constants for the
+// type.
+static SDValue combineTruncSelectToSMaxUSat(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ EVT SrcVT = N0.getValueType();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!VT.isVector() || !TLI.isTypeLegal(VT) || !TLI.isTypeLegal(SrcVT))
+ return SDValue();
+
+ if (N0.getOpcode() != ISD::VSELECT || !N0.hasOneUse())
+ return SDValue();
+
+ SDValue Cond = N0.getOperand(0);
+ SDValue True = N0.getOperand(1);
+ SDValue False = N0.getOperand(2);
+
+ if (Cond.getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ // FIXME: Support the version of this pattern with the select operands
+ // swapped.
+ ISD::CondCode CCVal = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ if (CCVal != ISD::SETULT)
+ return SDValue();
+
+ SDValue CondLHS = Cond.getOperand(0);
+ SDValue CondRHS = Cond.getOperand(1);
+
+ if (CondLHS != True)
+ return SDValue();
+
+ unsigned ScalarBits = VT.getScalarSizeInBits();
+
+ // FIXME: Support other constants.
+ ConstantSDNode *CondRHSC = isConstOrConstSplat(CondRHS);
+ if (!CondRHSC || CondRHSC->getAPIntValue() != (1ULL << ScalarBits))
+ return SDValue();
+
+ if (False.getOpcode() != ISD::SIGN_EXTEND)
+ return SDValue();
+
+ False = False.getOperand(0);
+
+ if (False.getOpcode() != ISD::SETCC || False.getOperand(0) != True)
+ return SDValue();
+
+ ConstantSDNode *FalseRHSC = isConstOrConstSplat(False.getOperand(1));
+ if (!FalseRHSC || !FalseRHSC->isZero())
+ return SDValue();
+
+ ISD::CondCode CCVal2 = cast<CondCodeSDNode>(False.getOperand(2))->get();
+ if (CCVal2 != ISD::SETGT)
+ return SDValue();
+
+ // Emit the signed to unsigned saturation pattern.
+ SDLoc DL(N);
+ SDValue Max =
+ DAG.getNode(ISD::SMAX, DL, SrcVT, True, DAG.getConstant(0, DL, SrcVT));
+ SDValue Min =
+ DAG.getNode(ISD::SMIN, DL, SrcVT, Max,
+ DAG.getConstant((1ULL << ScalarBits) - 1, DL, SrcVT));
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Min);
+}
+
static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
@@ -13424,7 +13495,7 @@ static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Srl);
}
- return SDValue();
+ return combineTruncSelectToSMaxUSat(N, DAG);
}
// Combines two comparison operation and logic operation to one selection
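For reference, each vnclipu.wi in the lowering performs an unsigned
saturating narrow to half the element width, so an i64 -> i8 truncate
takes one vmax plus three clips, as the tests below show. A scalar
model of that chain (hypothetical names, assuming a shift amount of 0):

#include <cstdint>

// One vnclipu.wi step with shift 0: unsigned saturate to dstBits.
uint64_t clipu(uint64_t v, unsigned dstBits) {
  uint64_t maxVal = (1ULL << dstBits) - 1;
  return v > maxVal ? maxVal : v;
}

// vmax.vx with zero, then narrow i64 -> i32 -> i16 -> i8.
uint8_t lowerI64ToU8(int64_t x) {
  uint64_t v = x < 0 ? 0 : uint64_t(x); // vmax.vx v8, v8, zero
  v = clipu(v, 32);                     // vnclipu.wi at e32
  v = clipu(v, 16);                     // vnclipu.wi at e16
  v = clipu(v, 8);                      // vnclipu.wi at e8
  return uint8_t(v);
}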
diff --git a/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll b/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll
new file mode 100644
index 0000000000000..28d7588b9347a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
+
+define <4 x i8> @test_v4i16_v4i8(<4 x i16> %x) {
+; CHECK-LABEL: test_v4i16_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vmax.vx v8, v8, zero
+; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v8, 0
+; CHECK-NEXT: ret
+ %a = icmp sgt <4 x i16> %x, zeroinitializer
+ %b = sext <4 x i1> %a to <4 x i16>
+ %c = icmp ult <4 x i16> %x, splat (i16 256)
+ %d = select <4 x i1> %c, <4 x i16> %x, <4 x i16> %b
+ %e = trunc <4 x i16> %d to <4 x i8>
+ ret <4 x i8> %e
+}
+
+define <4 x i8> @test_v4i32_v4i8(<4 x i32> %x) {
+; CHECK-LABEL: test_v4i32_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmax.vx v8, v8, zero
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v8, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v8, 0
+; CHECK-NEXT: ret
+ %a = icmp sgt <4 x i32> %x, zeroinitializer
+ %b = sext <4 x i1> %a to <4 x i32>
+ %c = icmp ult <4 x i32> %x, splat (i32 256)
+ %d = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %b
+ %e = trunc <4 x i32> %d to <4 x i8>
+ ret <4 x i8> %e
+}
+
+define <4 x i8> @test_v4i64_v4i8(<4 x i64> %x) {
+; CHECK-LABEL: test_v4i64_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vmax.vx v8, v8, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vnclipu.wi v10, v8, 0
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v10, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v8, 0
+; CHECK-NEXT: ret
+ %a = icmp sgt <4 x i64> %x, zeroinitializer
+ %b = sext <4 x i1> %a to <4 x i64>
+ %c = icmp ult <4 x i64> %x, splat (i64 256)
+ %d = select <4 x i1> %c, <4 x i64> %x, <4 x i64> %b
+ %e = trunc <4 x i64> %d to <4 x i8>
+ ret <4 x i8> %e
+}
+
+define <4 x i16> @test_v4i32_v4i16(<4 x i32> %x) {
+; CHECK-LABEL: test_v4i32_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmax.vx v8, v8, zero
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v8, 0
+; CHECK-NEXT: ret
+ %a = icmp sgt <4 x i32> %x, zeroinitializer
+ %b = sext <4 x i1> %a to <4 x i32>
+ %c = icmp ult <4 x i32> %x, splat (i32 65536)
+ %d = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %b
+ %e = trunc <4 x i32> %d to <4 x i16>
+ ret <4 x i16> %e
+}
+
+define <4 x i16> @test_v4i64_v4i16(<4 x i64> %x) {
+; CHECK-LABEL: test_v4i64_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vmax.vx v8, v8, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vnclipu.wi v10, v8, 0
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v10, 0
+; CHECK-NEXT: ret
+ %a = icmp sgt <4 x i64> %x, zeroinitializer
+ %b = sext <4 x i1> %a to <4 x i64>
+ %c = icmp ult <4 x i64> %x, splat (i64 65536)
+ %d = select <4 x i1> %c, <4 x i64> %x, <4 x i64> %b
+ %e = trunc <4 x i64> %d to <4 x i16>
+ ret <4 x i16> %e
+}
+
+define <4 x i32> @test_v4i64_v4i32(<4 x i64> %x) {
+; CHECK-LABEL: test_v4i64_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vmax.vx v10, v8, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v10, 0
+; CHECK-NEXT: ret
+ %a = icmp sgt <4 x i64> %x, zeroinitializer
+ %b = sext <4 x i1> %a to <4 x i64>
+ %c = icmp ult <4 x i64> %x, splat (i64 4294967296)
+ %d = select <4 x i1> %c, <4 x i64> %x, <4 x i64> %b
+ %e = trunc <4 x i64> %d to <4 x i32>
+ ret <4 x i32> %e
+}
+
+define <vscale x 4 x i8> @test_nxv4i16_nxv4i8(<vscale x 4 x i16> %x) {
+; CHECK-LABEL: test_nxv4i16_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmax.vx v8, v8, zero
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v8, 0
+; CHECK-NEXT: ret
+ %a = icmp sgt <vscale x 4 x i16> %x, zeroinitializer
+ %b = sext <vscale x 4 x i1> %a to <vscale x 4 x i16>
+ %c = icmp ult <vscale x 4 x i16> %x, splat (i16 256)
+ %d = select <vscale x 4 x i1> %c, <vscale x 4 x i16> %x, <vscale x 4 x i16> %b
+ %e = trunc <vscale x 4 x i16> %d to <vscale x 4 x i8>
+ ret <vscale x 4 x i8> %e
+}
+
+define <vscale x 4 x i8> @test_nxv4i32_nxv4i8(<vscale x 4 x i32> %x) {
+; CHECK-LABEL: test_nxv4i32_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT: vmax.vx v8, v8, zero
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vnclipu.wi v10, v8, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v10, 0
+; CHECK-NEXT: ret
+ %a = icmp sgt <vscale x 4 x i32> %x, zeroinitializer
+ %b = sext <vscale x 4 x i1> %a to <vscale x 4 x i32>
+ %c = icmp ult <vscale x 4 x i32> %x, splat (i32 256)
+ %d = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %x, <vscale x 4 x i32> %b
+ %e = trunc <vscale x 4 x i32> %d to <vscale x 4 x i8>
+ ret <vscale x 4 x i8> %e
+}
+
+define <vscale x 4 x i8> @test_nxv4i64_nxv4i8(<vscale x 4 x i64> %x) {
+; CHECK-LABEL: test_nxv4i64_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT: vmax.vx v8, v8, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vnclipu.wi v12, v8, 0
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v12, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v8, 0
+; CHECK-NEXT: ret
+ %a = icmp sgt <vscale x 4 x i64> %x, zeroinitializer
+ %b = sext <vscale x 4 x i1> %a to <vscale x 4 x i64>
+ %c = icmp ult <vscale x 4 x i64> %x, splat (i64 256)
+ %d = select <vscale x 4 x i1> %c, <vscale x 4 x i64> %x, <vscale x 4 x i64> %b
+ %e = trunc <vscale x 4 x i64> %d to <vscale x 4 x i8>
+ ret <vscale x 4 x i8> %e
+}
+
+define <vscale x 4 x i16> @test_nxv4i32_nxv4i16(<vscale x 4 x i32> %x) {
+; CHECK-LABEL: test_nxv4i32_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT: vmax.vx v10, v8, zero
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v10, 0
+; CHECK-NEXT: ret
+ %a = icmp sgt <vscale x 4 x i32> %x, zeroinitializer
+ %b = sext <vscale x 4 x i1> %a to <vscale x 4 x i32>
+ %c = icmp ult <vscale x 4 x i32> %x, splat (i32 65536)
+ %d = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %x, <vscale x 4 x i32> %b
+ %e = trunc <vscale x 4 x i32> %d to <vscale x 4 x i16>
+ ret <vscale x 4 x i16> %e
+}
+
+define <vscale x 4 x i16> @test_nxv4i64_nxv4i16(<vscale x 4 x i64> %x) {
+; CHECK-LABEL: test_nxv4i64_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT: vmax.vx v8, v8, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vnclipu.wi v12, v8, 0
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v12, 0
+; CHECK-NEXT: ret
+ %a = icmp sgt <vscale x 4 x i64> %x, zeroinitializer
+ %b = sext <vscale x 4 x i1> %a to <vscale x 4 x i64>
+ %c = icmp ult <vscale x 4 x i64> %x, splat (i64 65536)
+ %d = select <vscale x 4 x i1> %c, <vscale x 4 x i64> %x, <vscale x 4 x i64> %b
+ %e = trunc <vscale x 4 x i64> %d to <vscale x 4 x i16>
+ ret <vscale x 4 x i16> %e
+}
+
+define <vscale x 4 x i32> @test_nxv4i64_nxv4i32(<vscale x 4 x i64> %x) {
+; CHECK-LABEL: test_nxv4i64_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT: vmax.vx v12, v8, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v12, 0
+; CHECK-NEXT: ret
+ %a = icmp sgt <vscale x 4 x i64> %x, zeroinitializer
+ %b = sext <vscale x 4 x i1> %a to <vscale x 4 x i64>
+ %c = icmp ult <vscale x 4 x i64> %x, splat (i64 4294967296)
+ %d = select <vscale x 4 x i1> %c, <vscale x 4 x i64> %x, <vscale x 4 x i64> %b
+ %e = trunc <vscale x 4 x i64> %d to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %e
+}