[llvm] [X86] InstCombine: Generalize scalar SSE MAX/MIN intrinsics (PR #175375)

Sun Jan 11 08:57:42 PST 2026

https://github.com/guiolidc updated https://github.com/llvm/llvm-project/pull/175375

>From 34fa778699cb8320f1acf4543f9452ca8645be37 Mon Sep 17 00:00:00 2001
From: Guilherme Oliveira de Campos <oliveira.gui at hotmail.com.br>
Date: Sat, 10 Jan 2026 14:47:46 -0300
Subject: [PATCH] [X86] InstCombine: Generalize scalar SSE MAX/MIN intrinsics

This handles x86_sse_max_ss/min_ss and related intrinsics. It checks whether it is known to be safe to convert them to llvm.maxnum/minnum.
---
 .../Target/X86/X86InstCombineIntrinsic.cpp    | 49 ++++++++++++++----
 .../InstCombine/X86/x86-scalar-max-min.ll     | 51 +++++++++++++++++++
 2 files changed, 91 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/X86/x86-scalar-max-min.ll

diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 5eff38b214aef..76d543a9ab23f 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -1734,24 +1734,41 @@ static Value *simplifyTernarylogic(const IntrinsicInst &II,
   return Res.first;
 }
 
-static Value *simplifyX86FPMaxMin(const IntrinsicInst &II,
-                                  InstCombiner::BuilderTy &Builder,
-                                  Intrinsic::ID NewIID) {
+static Value *simplifyX86FPMaxMin(const IntrinsicInst &II, InstCombiner &IC,
+                                  Intrinsic::ID NewIID, bool IsScalar = false) {
 
   Value *Arg0 = II.getArgOperand(0);
   Value *Arg1 = II.getArgOperand(1);
+  unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
+
+  SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(&II);
+  APInt DemandedElts =
+      IsScalar ? APInt::getOneBitSet(VWidth, 0) : APInt::getAllOnes(VWidth);
 
   // Verify that the inputs are not one of (NaN, Inf, Subnormal, NegZero),
   // otherwise we cannot safely generalize to MAXNUM/MINNUM.
   FPClassTest Forbidden = fcNan | fcInf | fcSubnormal | fcNegZero;
   KnownFPClass KnownArg0 =
-      computeKnownFPClass(Arg0, Forbidden, II.getDataLayout(), 0);
+      computeKnownFPClass(Arg0, DemandedElts, Forbidden, SQ);
   KnownFPClass KnownArg1 =
-      computeKnownFPClass(Arg1, Forbidden, II.getDataLayout(), 0);
+      computeKnownFPClass(Arg1, DemandedElts, Forbidden, SQ);
 
   if (KnownArg0.isKnownNever(Forbidden) && KnownArg1.isKnownNever(Forbidden)) {
-    return (NewIID == Intrinsic::maxnum) ? Builder.CreateMaxNum(Arg0, Arg1)
-                                         : Builder.CreateMinNum(Arg0, Arg1);
+    if (IsScalar) {
+      // It performs the operation on the first element and puts it back into
+      // the vector.
+      Value *Scalar0 = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
+      Value *Scalar1 = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
+
+      Value *NewScalar = (NewIID == Intrinsic::maxnum)
+                             ? IC.Builder.CreateMaxNum(Scalar0, Scalar1)
+                             : IC.Builder.CreateMinNum(Scalar0, Scalar1);
+      return IC.Builder.CreateInsertElement(Arg0, NewScalar, (uint64_t)0);
+    } else {
+      return (NewIID == Intrinsic::maxnum)
+                 ? IC.Builder.CreateMaxNum(Arg0, Arg1)
+                 : IC.Builder.CreateMinNum(Arg0, Arg1);
+    }
   }
 
   return nullptr;
@@ -3138,6 +3155,20 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       return IC.replaceInstUsesWith(II, V);
     }
     break;
+
+  case Intrinsic::x86_sse_max_ss:
+  case Intrinsic::x86_sse2_max_sd: {
+    if (Value *V = simplifyX86FPMaxMin(II, IC, Intrinsic::maxnum, true))
+      return IC.replaceInstUsesWith(II, V);
+    break;
+  }
+  case Intrinsic::x86_sse_min_ss:
+  case Intrinsic::x86_sse2_min_sd: {
+    if (Value *V = simplifyX86FPMaxMin(II, IC, Intrinsic::minnum, true))
+      return IC.replaceInstUsesWith(II, V);
+    break;
+  }
+
   default:
     break;
   }
@@ -3351,7 +3382,7 @@ std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
   case Intrinsic::x86_avx512fp16_max_ph_128:
   case Intrinsic::x86_avx512fp16_max_ph_256:
   case Intrinsic::x86_avx512fp16_max_ph_512:
-    if (Value *V = simplifyX86FPMaxMin(II, IC.Builder, Intrinsic::maxnum))
+    if (Value *V = simplifyX86FPMaxMin(II, IC, Intrinsic::maxnum))
       return IC.replaceInstUsesWith(II, V);
     break;
   case Intrinsic::x86_sse_min_ps:
@@ -3363,7 +3394,7 @@ std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
   case Intrinsic::x86_avx512fp16_min_ph_128:
   case Intrinsic::x86_avx512fp16_min_ph_256:
   case Intrinsic::x86_avx512fp16_min_ph_512:
-    if (Value *V = simplifyX86FPMaxMin(II, IC.Builder, Intrinsic::minnum))
+    if (Value *V = simplifyX86FPMaxMin(II, IC, Intrinsic::minnum))
       return IC.replaceInstUsesWith(II, V);
     break;
 
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-scalar-max-min.ll b/llvm/test/Transforms/InstCombine/X86/x86-scalar-max-min.ll
new file mode 100644
index 0000000000000..9b46a84316433
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/X86/x86-scalar-max-min.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
+
+define <4 x float> @test_max_ss_nan(<4 x float> %a) {
+  ; We pass a vector where the bottom element is NaN
+; CHECK-LABEL: define <4 x float> @test_max_ss_nan(
+; CHECK-SAME: <4 x float> [[A:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> [[A]], <4 x float> <float 0x7FF8000000000000, float poison, float poison, float poison>)
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a, <4 x float> <float 0x7FF8000000000000, float 0.0, float 0.0, float 0.0>)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_min_ss_inf(<4 x float> %a) {
+; CHECK-LABEL: define <4 x float> @test_min_ss_inf(
+; CHECK-SAME: <4 x float> [[A:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> [[A]], <4 x float> <float 0x7FF0000000000000, float poison, float poison, float poison>)
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a, <4 x float> <float 0x7FF0000000000000, float 0.0, float 0.0, float 0.0>)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_max_ss_neg_zero(<4 x float> %a) {
+; CHECK-LABEL: define <4 x float> @test_max_ss_neg_zero(
+; CHECK-SAME: <4 x float> [[A:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> [[A]], <4 x float> <float -0.000000e+00, float poison, float poison, float poison>)
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a, <4 x float> <float -0.0, float 0.0, float 0.0, float 0.0>)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_max_ss_variable_unsafe(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: define <4 x float> @test_max_ss_variable_unsafe(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> [[A]], <4 x float> [[B]])
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_max_ss_safe_constants() {
+; CHECK-LABEL: define <4 x float> @test_max_ss_safe_constants() {
+; CHECK-NEXT:    ret <4 x float> <float 2.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
+;
+  %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> <float 1.0, float 0.0, float 0.0, float 0.0>, <4 x float> <float 2.0, float 0.0, float 0.0, float 0.0>)
+  ret <4 x float> %res
+}