[llvm] [X86] optimize ssse3 horizontal saturating add/sub (PR #169591)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 27 03:21:19 PST 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/169591
>From 46d83df2e27b979f00570aeeddada93f908ce9bc Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Wed, 26 Nov 2025 02:08:19 +0100
Subject: [PATCH 1/6] optimize ssse3 horizontal saturating add/sub
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 32 ++++-
llvm/lib/Target/X86/X86ISelLowering.h | 4 +
llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 2 +
llvm/lib/Target/X86/X86InstrSSE.td | 6 +
llvm/test/CodeGen/X86/haddsubsat.ll | 139 +++++++++++++++++++
5 files changed, 182 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/X86/haddsubsat.ll
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d49f25a950e3a..be93ffefa4225 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2654,6 +2654,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::AVGFLOORU,
ISD::BITREVERSE,
ISD::ADD,
+ ISD::SADDSAT,
+ ISD::SSUBSAT,
ISD::FADD,
ISD::FSUB,
ISD::FNEG,
@@ -8114,6 +8116,8 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
case X86ISD::FHSUB:
case X86ISD::HADD:
case X86ISD::HSUB:
+ case X86ISD::HADDS:
+ case X86ISD::HSUBS:
return true;
}
return false;
@@ -34984,6 +34988,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BLENDV)
NODE_NAME_CASE(HADD)
NODE_NAME_CASE(HSUB)
+ NODE_NAME_CASE(HADDS)
+ NODE_NAME_CASE(HSUBS)
NODE_NAME_CASE(FHADD)
NODE_NAME_CASE(FHSUB)
NODE_NAME_CASE(CONFLICT)
@@ -54034,7 +54040,8 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
unsigned Opcode = N->getOpcode();
- bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
+ bool IsAdd =
+ (Opcode == ISD::FADD) || (Opcode == ISD::ADD) || (Opcode == ISD::SADDSAT);
SmallVector<int, 8> PostShuffleMask;
auto MergableHorizOp = [N](unsigned HorizOpcode) {
@@ -54084,6 +54091,27 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
}
}
break;
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT:
+ if (Subtarget.hasSSSE3() && VT == MVT::v8i16) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ auto HorizOpcode = IsAdd ? X86ISD::HADDS : X86ISD::HSUBS;
+ if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
+ PostShuffleMask, MergableHorizOp(HorizOpcode))) {
+ auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
+ };
+ SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
+ {LHS, RHS}, HOpBuilder);
+ if (!PostShuffleMask.empty())
+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+ DAG.getUNDEF(VT), PostShuffleMask);
+ return HorizBinOp;
+ }
+ }
+ break;
}
return SDValue();
@@ -60793,6 +60821,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
case X86ISD::ADD:
case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT: return combineToHorizontalAddSub(N, DAG, Subtarget);
case X86ISD::CLOAD:
case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
case X86ISD::SBB: return combineSBB(N, DAG);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index e28b9c11a04cd..8425e18d0b35e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -270,6 +270,10 @@ namespace llvm {
HADD,
HSUB,
+ /// Integer horizontal saturating add/sub.
+ HADDS,
+ HSUBS,
+
/// Floating point horizontal add/sub.
FHADD,
FHSUB,
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 5321ecf0c1b2c..0803a4946b379 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -71,6 +71,8 @@ def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
+def X86hadds : SDNode<"X86ISD::HADDS", SDTIntBinOp>;
+def X86hsubs : SDNode<"X86ISD::HSUBS", SDTIntBinOp>;
def X86comi : SDNode<"X86ISD::COMI", SDTX86FCmp>;
def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86FCmp>;
def X86comi512 : SDNode<"X86ISD::COMX", SDTX86FCmp>;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 806b02b9f9359..ee16eaa0462ea 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -4949,6 +4949,12 @@ defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
}
+def : Pat<(v8i16 (X86hadds VR128:$src1, VR128:$src2)),
+ (PHADDSWrr VR128:$src1, VR128:$src2)>;
+
+def : Pat<(v8i16 (X86hsubs VR128:$src1, VR128:$src2)),
+ (PHSUBSWrr VR128:$src1, VR128:$src2)>;
+
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/X86/haddsubsat.ll b/llvm/test/CodeGen/X86/haddsubsat.ll
new file mode 100644
index 0000000000000..d7fd38c623c41
--- /dev/null
+++ b/llvm/test/CodeGen/X86/haddsubsat.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 -x86-asm-syntax=intel | FileCheck %s -check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 -x86-asm-syntax=intel | FileCheck %s -check-prefix=AVX2
+
+define <8 x i16> @phaddsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phaddsw_v8i16_intrinsic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phaddsw xmm0, xmm1
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phaddsw_v8i16_intrinsic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vphaddsw xmm0, xmm0, xmm1
+; AVX2-NEXT: ret
+entry:
+ %res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a, <8 x i16> %b)
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @phaddsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phaddsw_v8i16_generic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phaddsw xmm0, xmm1
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phaddsw_v8i16_generic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: phaddsw xmm0, xmm1
+; AVX2-NEXT: ret
+entry:
+ %even = shufflevector <8 x i16> %a, <8 x i16> %b,
+ <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %odd = shufflevector <8 x i16> %a, <8 x i16> %b,
+ <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %sum = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
+ ret <8 x i16> %sum
+}
+
+define <16 x i16> @phaddsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
+; SSSE3-LABEL: phaddsw_v16i16_generic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phaddsw xmm0, xmm1
+; SSSE3-NEXT: phaddsw xmm2, xmm3
+; SSSE3-NEXT: movdqa xmm1, xmm2
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phaddsw_v16i16_generic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX2-NEXT: vpshufb ymm1, ymm1, ymm3
+; AVX2-NEXT: vpshufb ymm0, ymm0, ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpaddsw ymm0, ymm2, ymm0
+; AVX2-NEXT: ret
+entry:
+ %even = shufflevector <16 x i16> %a, <16 x i16> %b,
+ <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
+ i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %odd = shufflevector <16 x i16> %a, <16 x i16> %b,
+ <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
+ i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %sum = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
+ ret <16 x i16> %sum
+}
+
+define <8 x i16> @phsubsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phsubsw_v8i16_intrinsic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phsubsw xmm0, xmm1
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phsubsw_v8i16_intrinsic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vphsubsw xmm0, xmm0, xmm1
+; AVX2-NEXT: ret
+entry:
+ %res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a, <8 x i16> %b)
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @phsubsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phsubsw_v8i16_generic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phsubsw xmm0, xmm1
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phsubsw_v8i16_generic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: phsubsw xmm0, xmm1
+; AVX2-NEXT: ret
+entry:
+ %even = shufflevector <8 x i16> %a, <8 x i16> %b,
+ <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %odd = shufflevector <8 x i16> %a, <8 x i16> %b,
+ <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %diff = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
+ ret <8 x i16> %diff
+}
+
+define <16 x i16> @phsubsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
+; SSSE3-LABEL: phsubsw_v16i16_generic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phsubsw xmm0, xmm1
+; SSSE3-NEXT: phsubsw xmm2, xmm3
+; SSSE3-NEXT: movdqa xmm1, xmm2
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phsubsw_v16i16_generic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX2-NEXT: vpshufb ymm1, ymm1, ymm3
+; AVX2-NEXT: vpshufb ymm0, ymm0, ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpsubsw ymm0, ymm2, ymm0
+; AVX2-NEXT: ret
+entry:
+ %even = shufflevector <16 x i16> %a, <16 x i16> %b,
+ <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
+ i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %odd = shufflevector <16 x i16> %a, <16 x i16> %b,
+ <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
+ i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %diff = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
+ ret <16 x i16> %diff
+}
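For reference, this is the kind of source pattern the new combine is aimed at, written as a minimal C++ sketch using Clang's vector extensions (an assumption for illustration only; the committed tests express the same pattern directly in LLVM IR):

#include <cstdint>

// 8 x i16, matching the v8i16 case handled above.
typedef int16_t v8i16 __attribute__((vector_size(16)));

v8i16 haddsw(v8i16 a, v8i16 b) {
  // Gather the even and odd lanes of the concatenation of a and b.
  v8i16 even = __builtin_shufflevector(a, b, 0, 2, 4, 6, 8, 10, 12, 14);
  v8i16 odd  = __builtin_shufflevector(a, b, 1, 3, 5, 7, 9, 11, 13, 15);
  // Signed saturating add of adjacent pairs; this emits
  // llvm.sadd.sat.v8i16, which the combine above can now lower to a
  // single phaddsw instead of two shuffles plus paddsw.
  return __builtin_elementwise_add_sat(even, odd);
}

Swapping __builtin_elementwise_add_sat for __builtin_elementwise_sub_sat gives the corresponding phsubsw form.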
>From a9ad357eeca4a2decb3b9fd637729ecaf2df0195 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Wed, 26 Nov 2025 15:16:49 +0100
Subject: [PATCH 2/6] move patterns into `X86IntrinsicsInfo.h`
---
llvm/lib/Target/X86/X86InstrSSE.td | 40 ++++++++++---------------
llvm/lib/Target/X86/X86IntrinsicsInfo.h | 4 +++
llvm/test/CodeGen/X86/haddsubsat.ll | 4 +--
3 files changed, 22 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index ee16eaa0462ea..e4aaa1e1b594a 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -4864,12 +4864,12 @@ let isCommutable = 0 in {
defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
int_x86_ssse3_psign_d_128,
SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG;
- defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
- int_x86_ssse3_phadd_sw_128,
- SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG;
- defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
- int_x86_ssse3_phsub_sw_128,
- SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG;
+ defm VPHADDSW : SS3I_binop_rm<0x03, "vphaddsw", X86hadds, v8i16, v8i16, VR128,
+ load, i128mem,
+ SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
+ defm VPHSUBSW : SS3I_binop_rm<0x07, "vphsubsw", X86hsubs, v8i16, v8i16, VR128,
+ load, i128mem,
+ SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
}
}
@@ -4907,12 +4907,12 @@ let isCommutable = 0 in {
SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
- defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
- int_x86_avx2_phadd_sw,
- SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG;
- defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
- int_x86_avx2_phsub_sw,
- SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG;
+ defm VPHADDSWY : SS3I_binop_rm<0x03, "vphaddsw", X86hadds, v16i16, v16i16,
+ VR256, load, i256mem,
+ SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
+ defm VPHSUBSWY : SS3I_binop_rm<0x07, "vphsubsw", X86hsubs, v16i16, v16i16,
+ VR256, load, i256mem,
+ SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
}
}
@@ -4935,12 +4935,10 @@ let isCommutable = 0 in {
SchedWriteVecALU.XMM, memop>;
defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
memop, i128mem, SchedWriteVarShuffle.XMM>;
- defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
- int_x86_ssse3_phadd_sw_128,
- SchedWritePHAdd.XMM, memop>;
- defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
- int_x86_ssse3_phsub_sw_128,
- SchedWritePHAdd.XMM, memop>;
+ defm PHADDSW : SS3I_binop_rm<0x03, "phaddsw", X86hadds, v8i16, v8i16, VR128,
+ memop, i128mem, SchedWritePHAdd.XMM>;
+ defm PHSUBSW : SS3I_binop_rm<0x07, "phsubsw", X86hsubs, v8i16, v8i16, VR128,
+ memop, i128mem, SchedWritePHAdd.XMM>;
defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
v16i8, VR128, memop, i128mem,
SchedWriteVecIMul.XMM>;
@@ -4949,12 +4947,6 @@ defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
}
-def : Pat<(v8i16 (X86hadds VR128:$src1, VR128:$src2)),
- (PHADDSWrr VR128:$src1, VR128:$src2)>;
-
-def : Pat<(v8i16 (X86hsubs VR128:$src1, VR128:$src2)),
- (PHSUBSWrr VR128:$src1, VR128:$src2)>;
-
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 0f725a8eb338b..99665b5872fe2 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -724,8 +724,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
+ X86_INTRINSIC_DATA(avx2_phadd_sw, INTR_TYPE_2OP, X86ISD::HADDS, 0),
X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(avx2_phsub_sw, INTR_TYPE_2OP, X86ISD::HSUBS, 0),
X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0),
X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
@@ -2017,11 +2019,13 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(ssse3_phadd_d, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phadd_sw, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(ssse3_phadd_sw_128, INTR_TYPE_2OP, X86ISD::HADDS, 0),
X86_INTRINSIC_DATA(ssse3_phadd_w, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phsub_d, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
X86_INTRINSIC_DATA(ssse3_phsub_sw, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(ssse3_phsub_sw_128, INTR_TYPE_2OP, X86ISD::HSUBS, 0),
X86_INTRINSIC_DATA(ssse3_phsub_w, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw, INTR_TYPE_CAST_MMX, 0, 0),
diff --git a/llvm/test/CodeGen/X86/haddsubsat.ll b/llvm/test/CodeGen/X86/haddsubsat.ll
index d7fd38c623c41..21d39b90dcb4b 100644
--- a/llvm/test/CodeGen/X86/haddsubsat.ll
+++ b/llvm/test/CodeGen/X86/haddsubsat.ll
@@ -25,7 +25,7 @@ define <8 x i16> @phaddsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
;
; AVX2-LABEL: phaddsw_v8i16_generic:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: phaddsw xmm0, xmm1
+; AVX2-NEXT: vphaddsw xmm0, xmm0, xmm1
; AVX2-NEXT: ret
entry:
%even = shufflevector <8 x i16> %a, <8 x i16> %b,
@@ -93,7 +93,7 @@ define <8 x i16> @phsubsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
;
; AVX2-LABEL: phsubsw_v8i16_generic:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: phsubsw xmm0, xmm1
+; AVX2-NEXT: vphsubsw xmm0, xmm0, xmm1
; AVX2-NEXT: ret
entry:
%even = shufflevector <8 x i16> %a, <8 x i16> %b,
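With the IntrinsicsWithoutChain entries above, the existing C-level intrinsics also route through the new ISD nodes rather than through TableGen intrinsic patterns. A hedged sketch of the user-facing entry points (these are the standard SSSE3 intrinsics from <immintrin.h>; nothing here is new API):

#include <immintrin.h>

// _mm_hadds_epi16 lowers to llvm.x86.ssse3.phadd.sw.128, which the
// table above now maps to X86ISD::HADDS and thus to phaddsw/vphaddsw.
__m128i hadds(__m128i a, __m128i b) { return _mm_hadds_epi16(a, b); }

// Likewise for the saturating horizontal subtract (phsubsw/vphsubsw).
__m128i hsubs(__m128i a, __m128i b) { return _mm_hsubs_epi16(a, b); }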
>From 4f903bb5cfbd6176df514cace79c1ca2f0ffcbf1 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Wed, 26 Nov 2025 15:22:20 +0100
Subject: [PATCH 3/6] handle 256-bit inputs when avx2 is enabled
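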
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +-
llvm/test/CodeGen/X86/haddsubsat.ll | 24 ++----------------------
2 files changed, 3 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index be93ffefa4225..4d1c9de52fc22 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54093,7 +54093,7 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
break;
case ISD::SADDSAT:
case ISD::SSUBSAT:
- if (Subtarget.hasSSSE3() && VT == MVT::v8i16) {
+ if ((Subtarget.hasSSSE3() && VT == MVT::v8i16) || (Subtarget.hasAVX2() && VT == MVT::v16i16)) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
auto HorizOpcode = IsAdd ? X86ISD::HADDS : X86ISD::HSUBS;
diff --git a/llvm/test/CodeGen/X86/haddsubsat.ll b/llvm/test/CodeGen/X86/haddsubsat.ll
index 21d39b90dcb4b..eb8f7bf34cec6 100644
--- a/llvm/test/CodeGen/X86/haddsubsat.ll
+++ b/llvm/test/CodeGen/X86/haddsubsat.ll
@@ -46,18 +46,8 @@ define <16 x i16> @phaddsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
;
; AVX2-LABEL: phaddsw_v16i16_generic:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
-; AVX2-NEXT: vpshufb ymm1, ymm1, ymm3
-; AVX2-NEXT: vpshufb ymm0, ymm0, ymm3
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: vphaddsw ymm0, ymm0, ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vpaddsw ymm0, ymm2, ymm0
; AVX2-NEXT: ret
entry:
%even = shufflevector <16 x i16> %a, <16 x i16> %b,
@@ -114,18 +104,8 @@ define <16 x i16> @phsubsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
;
; AVX2-LABEL: phsubsw_v16i16_generic:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
-; AVX2-NEXT: vpshufb ymm1, ymm1, ymm3
-; AVX2-NEXT: vpshufb ymm0, ymm0, ymm3
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: vphsubsw ymm0, ymm0, ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vpsubsw ymm0, ymm2, ymm0
; AVX2-NEXT: ret
entry:
%even = shufflevector <16 x i16> %a, <16 x i16> %b,
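The vpermq in the updated checks is needed because vphaddsw, like the other AVX2 horizontal ops, operates within each 128-bit lane; the [0,2,1,3] permute (immediate 0xD8) restores the element order of the generic even/odd pattern. As a sketch of the 256-bit case, again using Clang's vector extensions purely for illustration:

#include <cstdint>

typedef int16_t v16i16 __attribute__((vector_size(32)));

v16i16 haddsw256(v16i16 a, v16i16 b) {
  v16i16 even = __builtin_shufflevector(a, b, 0, 2, 4, 6, 8, 10, 12, 14,
                                        16, 18, 20, 22, 24, 26, 28, 30);
  v16i16 odd  = __builtin_shufflevector(a, b, 1, 3, 5, 7, 9, 11, 13, 15,
                                        17, 19, 21, 23, 25, 27, 29, 31);
  // With AVX2 this can now become vphaddsw ymm + vpermq ymm, 0xD8
  // instead of the long shuffle sequence removed above.
  return __builtin_elementwise_add_sat(even, odd);
}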
>From ba98cf2362d5b0ce920b1c5ae14580f57d0229fa Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Wed, 26 Nov 2025 15:34:41 +0100
Subject: [PATCH 4/6] combine wrapping and saturating logic
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 62 ++++++++++---------------
1 file changed, 25 insertions(+), 37 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4d1c9de52fc22..12d7d449d8beb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54042,6 +54042,7 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
unsigned Opcode = N->getOpcode();
bool IsAdd =
(Opcode == ISD::FADD) || (Opcode == ISD::ADD) || (Opcode == ISD::SADDSAT);
+ bool IsSat = (Opcode == ISD::SADDSAT) || (Opcode == ISD::SSUBSAT);
SmallVector<int, 8> PostShuffleMask;
auto MergableHorizOp = [N](unsigned HorizOpcode) {
@@ -54071,45 +54072,32 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
break;
case ISD::ADD:
case ISD::SUB:
- if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
- VT == MVT::v16i16 || VT == MVT::v8i32)) {
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
- if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
- PostShuffleMask, MergableHorizOp(HorizOpcode))) {
- auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
- return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
- };
- SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
- {LHS, RHS}, HOpBuilder);
- if (!PostShuffleMask.empty())
- HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
- DAG.getUNDEF(VT), PostShuffleMask);
- return HorizBinOp;
- }
- }
- break;
case ISD::SADDSAT:
case ISD::SSUBSAT:
- if ((Subtarget.hasSSSE3() && VT == MVT::v8i16) || (Subtarget.hasAVX2() && VT == MVT::v16i16)) {
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- auto HorizOpcode = IsAdd ? X86ISD::HADDS : X86ISD::HSUBS;
- if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
- PostShuffleMask, MergableHorizOp(HorizOpcode))) {
- auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
- return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
- };
- SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
- {LHS, RHS}, HOpBuilder);
- if (!PostShuffleMask.empty())
- HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
- DAG.getUNDEF(VT), PostShuffleMask);
- return HorizBinOp;
- }
+ if (IsSat && !((Subtarget.hasSSSE3() && VT == MVT::v8i16) ||
+ (Subtarget.hasAVX2() && VT == MVT::v16i16)))
+ break;
+ if (!IsSat &&
+ !(Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ VT == MVT::v16i16 || VT == MVT::v8i32)))
+ break;
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ auto HorizOpcode = IsSat ? (IsAdd ? X86ISD::HADDS : X86ISD::HSUBS)
+ : (IsAdd ? X86ISD::HADD : X86ISD::HSUB);
+ if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
+ PostShuffleMask, MergableHorizOp(HorizOpcode))) {
+ auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
+ };
+ SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
+ {LHS, RHS}, HOpBuilder);
+ if (!PostShuffleMask.empty())
+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+ DAG.getUNDEF(VT), PostShuffleMask);
+ return HorizBinOp;
}
break;
}
>From 61e65507455c4af2f3807bbab1dfce0ae8b31f27 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Thu, 27 Nov 2025 11:00:40 +0100
Subject: [PATCH 5/6] format test file
---
llvm/test/CodeGen/X86/haddsubsat.ll | 118 ++++++++++++----------------
1 file changed, 50 insertions(+), 68 deletions(-)
diff --git a/llvm/test/CodeGen/X86/haddsubsat.ll b/llvm/test/CodeGen/X86/haddsubsat.ll
index eb8f7bf34cec6..588f3383ec415 100644
--- a/llvm/test/CodeGen/X86/haddsubsat.ll
+++ b/llvm/test/CodeGen/X86/haddsubsat.ll
@@ -1,119 +1,101 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 -x86-asm-syntax=intel | FileCheck %s -check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 -x86-asm-syntax=intel | FileCheck %s -check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s -check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=AVX2
define <8 x i16> @phaddsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
; SSSE3-LABEL: phaddsw_v8i16_intrinsic:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: phaddsw xmm0, xmm1
-; SSSE3-NEXT: ret
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddsw %xmm1, %xmm0
+; SSSE3-NEXT: retq
;
; AVX2-LABEL: phaddsw_v8i16_intrinsic:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vphaddsw xmm0, xmm0, xmm1
-; AVX2-NEXT: ret
-entry:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
%res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a, <8 x i16> %b)
ret <8 x i16> %res
}
define <8 x i16> @phaddsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
; SSSE3-LABEL: phaddsw_v8i16_generic:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: phaddsw xmm0, xmm1
-; SSSE3-NEXT: ret
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddsw %xmm1, %xmm0
+; SSSE3-NEXT: retq
;
; AVX2-LABEL: phaddsw_v8i16_generic:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vphaddsw xmm0, xmm0, xmm1
-; AVX2-NEXT: ret
-entry:
- %even = shufflevector <8 x i16> %a, <8 x i16> %b,
- <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
- %odd = shufflevector <8 x i16> %a, <8 x i16> %b,
- <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; AVX2: # %bb.0:
+; AVX2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %even = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %odd = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%sum = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
ret <8 x i16> %sum
}
define <16 x i16> @phaddsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
; SSSE3-LABEL: phaddsw_v16i16_generic:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: phaddsw xmm0, xmm1
-; SSSE3-NEXT: phaddsw xmm2, xmm3
-; SSSE3-NEXT: movdqa xmm1, xmm2
-; SSSE3-NEXT: ret
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddsw %xmm1, %xmm0
+; SSSE3-NEXT: phaddsw %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
;
; AVX2-LABEL: phaddsw_v16i16_generic:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vphaddsw ymm0, ymm0, ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vphaddsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: ret
-entry:
- %even = shufflevector <16 x i16> %a, <16 x i16> %b,
- <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
- i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
- %odd = shufflevector <16 x i16> %a, <16 x i16> %b,
- <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
- i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; AVX2-NEXT: retq
+ %even = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %odd = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
%sum = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
ret <16 x i16> %sum
}
define <8 x i16> @phsubsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
; SSSE3-LABEL: phsubsw_v8i16_intrinsic:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: phsubsw xmm0, xmm1
-; SSSE3-NEXT: ret
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phsubsw %xmm1, %xmm0
+; SSSE3-NEXT: retq
;
; AVX2-LABEL: phsubsw_v8i16_intrinsic:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vphsubsw xmm0, xmm0, xmm1
-; AVX2-NEXT: ret
-entry:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
%res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a, <8 x i16> %b)
ret <8 x i16> %res
}
define <8 x i16> @phsubsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
; SSSE3-LABEL: phsubsw_v8i16_generic:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: phsubsw xmm0, xmm1
-; SSSE3-NEXT: ret
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phsubsw %xmm1, %xmm0
+; SSSE3-NEXT: retq
;
; AVX2-LABEL: phsubsw_v8i16_generic:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vphsubsw xmm0, xmm0, xmm1
-; AVX2-NEXT: ret
-entry:
- %even = shufflevector <8 x i16> %a, <8 x i16> %b,
- <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
- %odd = shufflevector <8 x i16> %a, <8 x i16> %b,
- <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; AVX2: # %bb.0:
+; AVX2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %even = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %odd = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%diff = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
ret <8 x i16> %diff
}
define <16 x i16> @phsubsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
; SSSE3-LABEL: phsubsw_v16i16_generic:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: phsubsw xmm0, xmm1
-; SSSE3-NEXT: phsubsw xmm2, xmm3
-; SSSE3-NEXT: movdqa xmm1, xmm2
-; SSSE3-NEXT: ret
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phsubsw %xmm1, %xmm0
+; SSSE3-NEXT: phsubsw %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
;
; AVX2-LABEL: phsubsw_v16i16_generic:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vphsubsw ymm0, ymm0, ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vphsubsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: ret
-entry:
- %even = shufflevector <16 x i16> %a, <16 x i16> %b,
- <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
- i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
- %odd = shufflevector <16 x i16> %a, <16 x i16> %b,
- <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
- i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; AVX2-NEXT: retq
+ %even = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %odd = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
%diff = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
ret <16 x i16> %diff
}
>From 4eb0f3ffcbe52f93c1e0a99f7ec244430322649f Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Thu, 27 Nov 2025 11:57:45 +0100
Subject: [PATCH 6/6] add hsat opcodes to `canonicalizeShuffleMaskWithHorizOp`
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 47 ++++++++++++-------------
1 file changed, 23 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 12d7d449d8beb..cc4fe89572b69 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40862,8 +40862,9 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
}))
return SDValue();
- bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
- Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
+ bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::FHSUB ||
+ Opcode0 == X86ISD::HADD || Opcode0 == X86ISD::HSUB ||
+ Opcode0 == X86ISD::HADDS || Opcode0 == X86ISD::HSUBS);
bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
if (!isHoriz && !isPack)
return SDValue();
@@ -54074,30 +54075,28 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
case ISD::SUB:
case ISD::SADDSAT:
case ISD::SSUBSAT:
- if (IsSat && !((Subtarget.hasSSSE3() && VT == MVT::v8i16) ||
- (Subtarget.hasAVX2() && VT == MVT::v16i16)))
- break;
- if (!IsSat &&
- !(Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
- VT == MVT::v16i16 || VT == MVT::v8i32)))
+ if (!Subtarget.hasSSSE3())
break;
+ if (VT == MVT::v8i16 || VT == MVT::v16i16 ||
+ (!IsSat && (VT == MVT::v4i32 || VT == MVT::v8i32))) {
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- auto HorizOpcode = IsSat ? (IsAdd ? X86ISD::HADDS : X86ISD::HSUBS)
- : (IsAdd ? X86ISD::HADD : X86ISD::HSUB);
- if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
- PostShuffleMask, MergableHorizOp(HorizOpcode))) {
- auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
- return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
- };
- SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
- {LHS, RHS}, HOpBuilder);
- if (!PostShuffleMask.empty())
- HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
- DAG.getUNDEF(VT), PostShuffleMask);
- return HorizBinOp;
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ auto HorizOpcode = IsSat ? (IsAdd ? X86ISD::HADDS : X86ISD::HSUBS)
+ : (IsAdd ? X86ISD::HADD : X86ISD::HSUB);
+ if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
+ PostShuffleMask, MergableHorizOp(HorizOpcode))) {
+ auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
+ };
+ SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
+ {LHS, RHS}, HOpBuilder);
+ if (!PostShuffleMask.empty())
+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+ DAG.getUNDEF(VT), PostShuffleMask);
+ return HorizBinOp;
+ }
}
break;
}