[llvm] 038f7de - [DAGCombine] fp_to_sint isSaturatingMinMax

Mon Jan 30 04:25:31 PST 2023

Author: Samuel Parker
Date: 2023-01-30T12:25:25Z
New Revision: 038f7debfda01471ce0d4eb1fed20da61e5c8b32

URL: https://github.com/llvm/llvm-project/commit/038f7debfda01471ce0d4eb1fed20da61e5c8b32
DIFF: https://github.com/llvm/llvm-project/commit/038f7debfda01471ce0d4eb1fed20da61e5c8b32.diff

LOG: [DAGCombine] fp_to_sint isSaturatingMinMax

Recommitting after fixing scalable vector crash.

Check for a single smax pattern against zero when converting from a
floating-point type small enough that its maximum value fits in the
integer type.

Differential Revision: https://reviews.llvm.org/D142481

Added: 
    llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll

Modified: 
    llvm/include/llvm/ADT/APFloat.h
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/Support/APFloat.cpp
    llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
    llvm/test/CodeGen/WebAssembly/fpclamptosat.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index c0e2d13c2939..cad6ef8caeb9 100644

--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -246,6 +246,7 @@ struct APFloatBase {
   static ExponentType semanticsMinExponent(const fltSemantics &);
   static ExponentType semanticsMaxExponent(const fltSemantics &);
   static unsigned int semanticsSizeInBits(const fltSemantics &);
+  static unsigned int semanticsIntSizeInBits(const fltSemantics&, bool);
 
   /// Returns the size of the floating point number (in bits) in the given
   /// semantics.

diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f32d3f0976e5..ad691ee53e95 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5108,7 +5108,7 @@ SDValue DAGCombiner::visitMULO(SDNode *N) {
 // same as SimplifySelectCC. N0<N1 ? N2 : N3.
 static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2,
                                   SDValue N3, ISD::CondCode CC, unsigned &BW,
-                                  bool &Unsigned) {
+                                  bool &Unsigned, SelectionDAG &DAG) {
   auto isSignedMinMax = [&](SDValue N0, SDValue N1, SDValue N2, SDValue N3,
                             ISD::CondCode CC) {
     // The compare and select operand should be the same or the select operands
@@ -5132,6 +5132,26 @@ static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2,
   if (!Opcode0)
     return SDValue();
 
+  // We may need only one range check if the fptosi can never produce
+  // the upper value.
+  if (N0.getOpcode() == ISD::FP_TO_SINT && Opcode0 == ISD::SMAX) {
+    if (isNullOrNullSplat(N3)) {
+      EVT IntVT = N0.getValueType().getScalarType();
+      EVT FPVT = N0.getOperand(0).getValueType().getScalarType();
+      if (FPVT.isSimple()) {
+        Type *InputTy = FPVT.getTypeForEVT(*DAG.getContext());
+        const fltSemantics &Semantics = InputTy->getFltSemantics();
+        uint32_t MinBitWidth =
+          APFloatBase::semanticsIntSizeInBits(Semantics, /*isSigned*/ true);
+        if (IntVT.getSizeInBits() >= MinBitWidth) {
+          Unsigned = true;
+          BW = PowerOf2Ceil(MinBitWidth);
+          return N0;
+        }
+      }
+    }
+  }
+
   SDValue N00, N01, N02, N03;
   ISD::CondCode N0CC;
   switch (N0.getOpcode()) {
@@ -5194,7 +5214,7 @@ static SDValue PerformMinMaxFpToSatCombine(SDValue N0, SDValue N1, SDValue N2,
                                            SelectionDAG &DAG) {
   unsigned BW;
   bool Unsigned;
-  SDValue Fp = isSaturatingMinMax(N0, N1, N2, N3, CC, BW, Unsigned);
+  SDValue Fp = isSaturatingMinMax(N0, N1, N2, N3, CC, BW, Unsigned, DAG);
   if (!Fp || Fp.getOpcode() != ISD::FP_TO_SINT)
     return SDValue();
   EVT FPVT = Fp.getOperand(0).getValueType();

diff  --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 9a8febad15f1..e9998d71b82a 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -251,6 +251,16 @@ namespace llvm {
   unsigned int APFloatBase::semanticsSizeInBits(const fltSemantics &semantics) {
     return semantics.sizeInBits;
   }
+  unsigned int APFloatBase::semanticsIntSizeInBits(const fltSemantics &semantics,
+                                                   bool isSigned) {
+    // The max FP value is pow(2, MaxExponent) * (1 + MaxFraction), so we need
+    // at least one more bit than the MaxExponent to hold the max FP value.
+    unsigned int MinBitWidth = semanticsMaxExponent(semantics) + 1;
+    // Extra sign bit needed.
+    if (isSigned)
+      ++MinBitWidth;
+    return MinBitWidth;
+  }
 
   unsigned APFloatBase::getSizeInBits(const fltSemantics &Sem) {
     return Sem.sizeInBits;

diff  --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 3f851a2b2182..70ec15fca808 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -501,16 +501,12 @@ static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombinerImpl &IC,
     // If the integer type can hold the max FP value, it is safe to cast
     // directly to that type. Otherwise, we may create poison via overflow
     // that did not exist in the original code.
-    //
-    // The max FP value is pow(2, MaxExponent) * (1 + MaxFraction), so we need
-    // at least one more bit than the MaxExponent to hold the max FP value.
     Type *InputTy = I->getOperand(0)->getType()->getScalarType();
     const fltSemantics &Semantics = InputTy->getFltSemantics();
-    uint32_t MinBitWidth = APFloatBase::semanticsMaxExponent(Semantics);
-    // Extra sign bit needed.
-    if (I->getOpcode() == Instruction::FPToSI)
-      ++MinBitWidth;
-    return Ty->getScalarSizeInBits() > MinBitWidth;
+    uint32_t MinBitWidth =
+      APFloatBase::semanticsIntSizeInBits(Semantics,
+          I->getOpcode() == Instruction::FPToSI);
+    return Ty->getScalarSizeInBits() >= MinBitWidth;
   }
   default:
     // TODO: Can handle more cases here.

diff  --git a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
new file mode 100644
index 000000000000..2e6f704975bd
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s
+
+define i64 @scalable_int_min_max(ptr %arg, ptr %arg1, <vscale x 2 x ptr> %i37, <vscale x 2 x i64> %i42, <vscale x 2 x i64> %i54) {
+; CHECK-LABEL: scalable_int_min_max:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w8, #3745
+; CHECK-NEXT:    movk w8, #16618, lsl #16
+; CHECK-NEXT:    ld1w { z3.d }, p0/z, [x0]
+; CHECK-NEXT:    mov w9, #57344
+; CHECK-NEXT:    mov z6.d, #1023 // =0x3ff
+; CHECK-NEXT:    movk w9, #17535, lsl #16
+; CHECK-NEXT:    mov z4.s, w8
+; CHECK-NEXT:    fmul z4.s, p0/m, z4.s, z3.s
+; CHECK-NEXT:    mov z5.s, w9
+; CHECK-NEXT:    fadd z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    mov z5.d, #0 // =0x0
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.s
+; CHECK-NEXT:    sxtw z5.d, p0/m, z5.d
+; CHECK-NEXT:    smax z4.d, p0/m, z4.d, z5.d
+; CHECK-NEXT:    movprfx z5, z6
+; CHECK-NEXT:    sxtw z5.d, p0/m, z6.d
+; CHECK-NEXT:    smin z4.d, p0/m, z4.d, z5.d
+; CHECK-NEXT:    cmpne p1.d, p0/z, z4.d, #0
+; CHECK-NEXT:    ld1w { z5.d }, p1/z, [x1]
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [z0.d]
+; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z5.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z3.s
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    cmpne p2.d, p2/z, z4.d, #0
+; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    add z2.d, p1/m, z2.d, z1.d
+; CHECK-NEXT:    uaddv d0, p0, z2.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+entry:
+  %i56 = getelementptr inbounds float, ptr %arg, i64 0
+  %i57 = load <vscale x 2 x float>, ptr %i56, align 4
+  %i58 = fmul <vscale x 2 x float> %i57, shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 0x401D41D420000000, i64 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
+  %i59 = fadd <vscale x 2 x float> %i58, shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 1.023500e+03, i64 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
+  %i60 = fptosi <vscale x 2 x float> %i59 to <vscale x 2 x i32>
+  %i61 = tail call <vscale x 2 x i32> @llvm.smax.nxv2i32(<vscale x 2 x i32> %i60, <vscale x 2 x i32> zeroinitializer)
+  %i62 = tail call <vscale x 2 x i32> @llvm.smin.nxv2i32(<vscale x 2 x i32> %i61, <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1023, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer))
+  %i63 = icmp ne <vscale x 2 x i32> %i62, zeroinitializer
+  %i64 = getelementptr float, ptr %arg1, i64 0
+  %i65 = tail call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0(ptr %i64, i32 4, <vscale x 2 x i1> %i63, <vscale x 2 x float> poison)
+  %i66 = tail call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> %i37, i32 4, <vscale x 2 x i1> %i63, <vscale x 2 x float> poison)
+  %i67 = fadd <vscale x 2 x float> %i65, %i66
+  %i68 = fcmp ult <vscale x 2 x float> %i67, %i57
+  %i74 = select <vscale x 2 x i1> %i63, <vscale x 2 x i1> %i68, <vscale x 2 x i1> zeroinitializer
+  %i75 = select <vscale x 2 x i1> %i74, <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> %i42
+  %i76 = select <vscale x 2 x i1> %i63, <vscale x 2 x i64> %i75, <vscale x 2 x i64> zeroinitializer
+  %i77 = add <vscale x 2 x i64> %i54, %i76
+  %i116 = tail call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %i77)
+  ret i64 %i116
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare <vscale x 2 x i32> @llvm.smax.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>) #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare <vscale x 2 x i32> @llvm.smin.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>) #0
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
+declare <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0(ptr nocapture, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x float>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr>, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x float>) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>) #3
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(read) }
+attributes #3 = { nocallback nofree nosync nounwind willreturn memory(none) }

diff  --git a/llvm/test/CodeGen/WebAssembly/fpclamptosat.ll b/llvm/test/CodeGen/WebAssembly/fpclamptosat.ll
index 4a1a9a2b3cf0..9a39c62591e1 100644
--- a/llvm/test/CodeGen/WebAssembly/fpclamptosat.ll
+++ b/llvm/test/CodeGen/WebAssembly/fpclamptosat.ll
@@ -191,19 +191,11 @@ entry:
 define i32 @ustest_f16i32_cse(half %x) {
 ; CHECK-LABEL: ustest_f16i32_cse:
 ; CHECK:         .functype ustest_f16i32_cse (f32) -> (i32)
-; CHECK-NEXT:    .local i64
 ; CHECK-NEXT:  # %bb.0: # %entry
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    call __truncsfhf2
 ; CHECK-NEXT:    call __extendhfsf2
-; CHECK-NEXT:    i64.trunc_sat_f32_s
-; CHECK-NEXT:    local.tee 1
-; CHECK-NEXT:    i64.const 0
-; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i64.const 0
-; CHECK-NEXT:    i64.gt_s
-; CHECK-NEXT:    i64.select
-; CHECK-NEXT:    i32.wrap_i64
+; CHECK-NEXT:    i32.trunc_sat_f32_u
 ; CHECK-NEXT:    # fallthrough-return
 entry:
   %conv = fptosi half %x to i64
@@ -485,18 +477,11 @@ entry:
 define i16 @ustest_f16i16_cse(half %x) {
 ; CHECK-LABEL: ustest_f16i16_cse:
 ; CHECK:         .functype ustest_f16i16_cse (f32) -> (i32)
-; CHECK-NEXT:    .local i32
 ; CHECK-NEXT:  # %bb.0: # %entry
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    call __truncsfhf2
 ; CHECK-NEXT:    call __extendhfsf2
-; CHECK-NEXT:    i32.trunc_sat_f32_s
-; CHECK-NEXT:    local.tee 1
-; CHECK-NEXT:    i32.const 0
-; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 0
-; CHECK-NEXT:    i32.gt_s
-; CHECK-NEXT:    i32.select
+; CHECK-NEXT:    i32.trunc_sat_f32_u
 ; CHECK-NEXT:    # fallthrough-return
 entry:
   %conv = fptosi half %x to i32
@@ -1214,19 +1199,11 @@ entry:
 define i32 @ustest_f16i32_mm_cse(half %x) {
 ; CHECK-LABEL: ustest_f16i32_mm_cse:
 ; CHECK:         .functype ustest_f16i32_mm_cse (f32) -> (i32)
-; CHECK-NEXT:    .local i64
 ; CHECK-NEXT:  # %bb.0: # %entry
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    call __truncsfhf2
 ; CHECK-NEXT:    call __extendhfsf2
-; CHECK-NEXT:    i64.trunc_sat_f32_s
-; CHECK-NEXT:    local.tee 1
-; CHECK-NEXT:    i64.const 0
-; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i64.const 0
-; CHECK-NEXT:    i64.gt_s
-; CHECK-NEXT:    i64.select
-; CHECK-NEXT:    i32.wrap_i64
+; CHECK-NEXT:    i32.trunc_sat_f32_u
 ; CHECK-NEXT:    # fallthrough-return
 entry:
   %conv = fptosi half %x to i64
@@ -1477,18 +1454,11 @@ entry:
 define i16 @ustest_f16i16_mm_cse(half %x) {
 ; CHECK-LABEL: ustest_f16i16_mm_cse:
 ; CHECK:         .functype ustest_f16i16_mm_cse (f32) -> (i32)
-; CHECK-NEXT:    .local i32
 ; CHECK-NEXT:  # %bb.0: # %entry
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    call __truncsfhf2
 ; CHECK-NEXT:    call __extendhfsf2
-; CHECK-NEXT:    i32.trunc_sat_f32_s
-; CHECK-NEXT:    local.tee 1
-; CHECK-NEXT:    i32.const 0
-; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 0
-; CHECK-NEXT:    i32.gt_s
-; CHECK-NEXT:    i32.select
+; CHECK-NEXT:    i32.trunc_sat_f32_u
 ; CHECK-NEXT:    # fallthrough-return
 entry:
   %conv = fptosi half %x to i32