[llvm] [WebAssembly] Optimize legalized vector multiplications into i32x4.dot_i16x8_s (PR #183244)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 25 21:35:10 PST 2026
https://github.com/ParkHanbum updated https://github.com/llvm/llvm-project/pull/183244
>From c6fbd779f5380a2621281d9e9777b8f99c8e148f Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Wed, 25 Feb 2026 14:44:37 +0900
Subject: [PATCH 1/6] add testcases for upcoming patch
---
.../WebAssembly/simd-dot-reductions.ll | 175 +++++++++++++++++-
1 file changed, 172 insertions(+), 3 deletions(-)
diff --git a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
index d9e5aba6f9b94..6cdccf5df1e4c 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
@@ -1,7 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mattr=+simd128 | FileCheck %s
-
-target triple = "wasm32-unknown-unknown"
+; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mattr=+simd128 | FileCheck %s
define <4 x i32> @dot_sext_1(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: dot_sext_1:
@@ -141,3 +139,174 @@ start:
%4 = add nsw <4 x i32> %2, %3
ret <4 x i32> %4
}
+
+define void @load_sext_both(ptr %a, ptr %b, ptr %s) {
+; CHECK-LABEL: load_sext_both:
+; CHECK: .functype load_sext_both (i32, i32, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load 0
+; CHECK-NEXT: i32x4.dot_i16x8_s
+; CHECK-NEXT: v128.store 0
+; CHECK-NEXT: # fallthrough-return
+ %load = load <8 x i16>, ptr %a, align 16
+ %1 = sext <8 x i16> %load to <8 x i32>
+ %2 = load <8 x i16>, ptr %b, align 16
+ %3 = sext <8 x i16> %2 to <8 x i32>
+ %4 = mul nsw <8 x i32> %3, %1
+ %5 = shufflevector <8 x i32> %4, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %6 = shufflevector <8 x i32> %4, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %7 = add <4 x i32> %5, %6
+ store <4 x i32> %7, ptr %s, align 16
+ ret void
+}
+
+define void @add_from_same_base(ptr %a, ptr %s) {
+; CHECK-LABEL: add_from_same_base:
+; CHECK: .functype add_from_same_base (i32, i32) -> ()
+; CHECK-NEXT: .local v128
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load 0
+; CHECK-NEXT: local.tee 2
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i32x4.dot_i16x8_s
+; CHECK-NEXT: v128.store 0
+; CHECK-NEXT: # fallthrough-return
+ %load = load <8 x i16>, ptr %a, align 16
+ %1 = sext <8 x i16> %load to <8 x i32>
+ %mul = mul nsw <8 x i32> %1, %1
+ %2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add nsw <4 x i32> %2, %3
+ store <4 x i32> %4, ptr %s, align 16
+ ret void
+}
+
+define <4 x i32> @combine_with_constant(<8 x i16> %v) {
+; CHECK-LABEL: combine_with_constant:
+; CHECK: .functype combine_with_constant (v128) -> (v128)
+; CHECK-NEXT: .local v128
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extend_low_i16x8_s
+; CHECK-NEXT: local.tee 1
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extract_lane 0
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32x4.replace_lane 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extract_lane 2
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32x4.replace_lane 2
+; CHECK-NEXT: local.tee 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extend_high_i16x8_s
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extract_lane 0
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32x4.replace_lane 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extract_lane 2
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32x4.replace_lane 2
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: i32x4.add
+; CHECK-NEXT: # fallthrough-return
+ %sext = sext <8 x i16> %v to <8 x i32>
+ %1 = mul nsw <8 x i32> %sext, <i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add <4 x i32> %2, %3
+ ret <4 x i32> %4
+}
+
+define <4 x i32> @combine_with_shl(<8 x i16> %v) {
+; CHECK-LABEL: combine_with_shl:
+; CHECK: .functype combine_with_shl (v128) -> (v128)
+; CHECK-NEXT: .local v128
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extend_low_i16x8_s
+; CHECK-NEXT: local.tee 1
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extract_lane 0
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32x4.replace_lane 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extract_lane 2
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32x4.replace_lane 2
+; CHECK-NEXT: local.tee 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extend_high_i16x8_s
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extract_lane 0
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32x4.replace_lane 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extract_lane 2
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32x4.replace_lane 2
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: i32x4.add
+; CHECK-NEXT: # fallthrough-return
+ %sext = sext <8 x i16> %v to <8 x i32>
+ %1 = shl nsw <8 x i32> %sext, <i32 12, i32 0, i32 12, i32 0, i32 12, i32 0, i32 12, i32 0>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add <4 x i32> %2, %3
+ ret <4 x i32> %4
+}
+
+; Negative - shifts by 16 overflow, so for x86 at least we don't optimize this.
+define <4 x i32> @combine_with_shl_overflow(<8 x i16> %v) {
+; CHECK-LABEL: combine_with_shl_overflow:
+; CHECK: .functype combine_with_shl_overflow (v128) -> (v128)
+; CHECK-NEXT: .local v128
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extend_low_i16x8_u
+; CHECK-NEXT: i32.const 16
+; CHECK-NEXT: i32x4.shl
+; CHECK-NEXT: local.tee 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extend_high_i16x8_u
+; CHECK-NEXT: i32.const 16
+; CHECK-NEXT: i32x4.shl
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: i32x4.add
+; CHECK-NEXT: # fallthrough-return
+ %sext = sext <8 x i16> %v to <8 x i32>
+ %1 = shl nsw <8 x i32> %sext, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add <4 x i32> %2, %3
+ ret <4 x i32> %4
+}
>From 1625e83f61ee0e2ef82b530ee292e105fa4c1b5a Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Wed, 25 Feb 2026 14:47:33 +0900
Subject: [PATCH 2/6] [WebAssembly] Optimize legalized vector multiplications
into i32x4.dot_i16x8_s
When performing multiplication and sign-extension from <8 x i16> to <8 x i32>,
Type Legalization shatters the DAG into a pair of BUILD_VECTORs fed by
EXTRACT_VECTOR_ELTs, resulting in inefficient instruction emission.
This patch recognizes the legalized ADD(BUILD_VECTOR, BUILD_VECTOR) pattern and
combines it into the dedicated WebAssembly SIMD instruction i32x4.dot_i16x8_s.
Implemented combineToDOT helper in performADDCombine to emit a single
i32x4.dot_i16x8_s instruction.
Fixes: #179143
---
.../WebAssembly/WebAssemblyISelLowering.cpp | 109 ++++++++++++++++++
.../WebAssembly/simd-dot-reductions.ll | 70 +----------
2 files changed, 113 insertions(+), 66 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index faea931aeccdc..667830946b8c1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -224,6 +224,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
ISD::CONCAT_VECTORS});
setTargetDAGCombine(ISD::TRUNCATE);
+ setTargetDAGCombine(ISD::ADD);
// Support saturating add/sub for i8x16 and i16x8
for (auto Op : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
@@ -3714,6 +3715,112 @@ SDValue performConvertFPCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+// Considering the following pattern
+// t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
+// (add
+// (build_vector (extract_elt t, 0),
+// (extract_elt t, 2),
+// (extract_elt t, 4),
+// (extract_elt t, 6)),
+// (build_vector (extract_elt t, 1),
+// (extract_elt t, 3),
+// (extract_elt t, 5),
+// (extract_elt t, 7)))
+// will combine into
+// (v4i32 dot (x0, x1))
+static SDValue combineToDOT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ using namespace SDPatternMatch;
+ SelectionDAG &DAG = DCI.DAG;
+ if (!DAG.getSubtarget<WebAssemblySubtarget>().hasSIMD128())
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ if (LHS.getOpcode() != ISD::BUILD_VECTOR ||
+ RHS.getOpcode() != ISD::BUILD_VECTOR || LHS.getNumOperands() != 4 ||
+ RHS.getNumOperands() != 4)
+ return SDValue();
+
+ SDValue BaseVec;
+ for (unsigned I = 0; I < 4; ++I) {
+ SDValue OpL = LHS.getOperand(I);
+ SDValue OpR = RHS.getOperand(I);
+ SDValue VecL, VecR;
+ APInt IdxL, IdxR;
+ if (!sd_match(OpL, m_ExtractElt(m_Value(VecL), m_ConstInt(IdxL))) ||
+ !sd_match(OpR, m_ExtractElt(m_Value(VecR), m_ConstInt(IdxR))))
+ return SDValue();
+
+ if (!BaseVec) {
+ BaseVec = VecL;
+ unsigned Opc = BaseVec.getOpcode();
+ if ((Opc != ISD::MUL && Opc != ISD::SHL) ||
+ BaseVec.getValueType().getVectorNumElements() != 8)
+ return SDValue();
+ }
+
+ if (BaseVec != VecL || BaseVec != VecR)
+ return SDValue();
+
+ if (IdxL.getZExtValue() > IdxR.getZExtValue())
+ std::swap(IdxL, IdxR);
+ if (!(IdxL == I * 2 && IdxR == I * 2 + 1))
+ return SDValue();
+ }
+
+ SDValue DotLHS, DotRHS;
+ SDValue BaseLHS, BaseRHS;
+ unsigned BaseOpc = BaseVec.getOpcode();
+ if (BaseOpc == ISD::MUL &&
+ sd_match(BaseVec, m_Mul(m_UnaryOp(ISD::SIGN_EXTEND, m_Value(BaseLHS)),
+ m_UnaryOp(ISD::SIGN_EXTEND, m_Value(BaseRHS))))) {
+ DotLHS = BaseLHS;
+ DotRHS = BaseRHS;
+ } else if (sd_match(BaseVec,
+ m_Mul(m_UnaryOp(ISD::SIGN_EXTEND, m_Value(BaseLHS)),
+ m_Value(BaseRHS))) ||
+ sd_match(BaseVec,
+ m_Shl(m_UnaryOp(ISD::SIGN_EXTEND, m_Value(BaseLHS)),
+ m_Value(BaseRHS)))) {
+ // (mul | shl) (sext(v8i16 LHS), v8i16 <C, ...>)
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(BaseRHS)) {
+ SmallVector<SDValue, 8> DotMultipliers;
+ for (unsigned I = 0; I < 8; ++I) {
+ auto *C = dyn_cast<ConstantSDNode>(BV->getOperand(I));
+ if (!C)
+ return SDValue();
+
+ uint64_t Val = C->getZExtValue();
+ if (BaseOpc == ISD::SHL)
+ Val = 1ULL << Val;
+ // A large multiplier (e.g. a shift by 15 or more) would overflow a
+ // signed i16.
+ if (!isInt<16>(Val))
+ return SDValue();
+
+ DotMultipliers.push_back(DAG.getConstant(Val, DL, MVT::i16));
+ }
+
+ DotLHS = BaseLHS;
+ DotRHS = DAG.getBuildVector(MVT::v8i16, DL, DotMultipliers);
+ }
+ }
+
+ if (!DotLHS || !DotRHS)
+ return SDValue();
+
+ return DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, DotLHS, DotRHS);
+}
+
+static SDValue performADDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (SDValue DotCombine = combineToDOT(N, DCI))
+ return DotCombine;
+
+ return SDValue();
+}
+
SDValue
WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -3749,5 +3856,7 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
return performAnyAllCombine(N, DCI.DAG);
case ISD::MUL:
return performMulCombine(N, DCI);
+ case ISD::ADD:
+ return performADDCombine(N, DCI);
}
}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
index 6cdccf5df1e4c..efb37db738628 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
@@ -190,41 +190,10 @@ define void @add_from_same_base(ptr %a, ptr %s) {
define <4 x i32> @combine_with_constant(<8 x i16> %v) {
; CHECK-LABEL: combine_with_constant:
; CHECK: .functype combine_with_constant (v128) -> (v128)
-; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32x4.extend_low_i16x8_s
-; CHECK-NEXT: local.tee 1
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: i32x4.extract_lane 0
-; CHECK-NEXT: i32.const 12
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32x4.replace_lane 0
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: i32x4.extract_lane 2
-; CHECK-NEXT: i32.const 12
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32x4.replace_lane 2
-; CHECK-NEXT: local.tee 1
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32x4.extend_high_i16x8_s
-; CHECK-NEXT: local.tee 0
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32x4.extract_lane 0
-; CHECK-NEXT: i32.const 12
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32x4.replace_lane 0
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32x4.extract_lane 2
-; CHECK-NEXT: i32.const 12
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32x4.replace_lane 2
-; CHECK-NEXT: local.tee 0
-; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK-NEXT: i32x4.add
+; CHECK-NEXT: v128.const 4096, 1, 4096, 1, 4096, 1, 4096, 1
+; CHECK-NEXT: i32x4.dot_i16x8_s
; CHECK-NEXT: # fallthrough-return
%sext = sext <8 x i16> %v to <8 x i32>
%1 = mul nsw <8 x i32> %sext, <i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1>
@@ -237,41 +206,10 @@ define <4 x i32> @combine_with_constant(<8 x i16> %v) {
define <4 x i32> @combine_with_shl(<8 x i16> %v) {
; CHECK-LABEL: combine_with_shl:
; CHECK: .functype combine_with_shl (v128) -> (v128)
-; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32x4.extend_low_i16x8_s
-; CHECK-NEXT: local.tee 1
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: i32x4.extract_lane 0
-; CHECK-NEXT: i32.const 12
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32x4.replace_lane 0
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: i32x4.extract_lane 2
-; CHECK-NEXT: i32.const 12
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32x4.replace_lane 2
-; CHECK-NEXT: local.tee 1
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32x4.extend_high_i16x8_s
-; CHECK-NEXT: local.tee 0
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32x4.extract_lane 0
-; CHECK-NEXT: i32.const 12
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32x4.replace_lane 0
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32x4.extract_lane 2
-; CHECK-NEXT: i32.const 12
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32x4.replace_lane 2
-; CHECK-NEXT: local.tee 0
-; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK-NEXT: i32x4.add
+; CHECK-NEXT: v128.const 4096, 1, 4096, 1, 4096, 1, 4096, 1
+; CHECK-NEXT: i32x4.dot_i16x8_s
; CHECK-NEXT: # fallthrough-return
%sext = sext <8 x i16> %v to <8 x i32>
%1 = shl nsw <8 x i32> %sext, <i32 12, i32 0, i32 12, i32 0, i32 12, i32 0, i32 12, i32 0>
>From 56a0750c7683f378fce7c264be76713c2b1603ca Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Wed, 25 Feb 2026 18:30:18 +0900
Subject: [PATCH 3/6] function comment has been revised to fully capture the
function's purpose
---
.../WebAssembly/WebAssemblyISelLowering.cpp | 41 +++++++++++++------
1 file changed, 28 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 667830946b8c1..b8f0d3fc538b7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3715,19 +3715,34 @@ SDValue performConvertFPCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// Considering the following pattern
-// t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
-// (add
-// (build_vector (extract_elt t, 0),
-// (extract_elt t, 2),
-// (extract_elt t, 4),
-// (extract_elt t, 6)),
-// (build_vector (extract_elt t, 1),
-// (extract_elt t, 3),
-// (extract_elt t, 5),
-// (extract_elt t, 7)))
-// will combine into
-// (v4i32 dot (x0, x1))
+// We are looking for patterns that represent a widened arithmetic operation
+// followed by a pairwise addition. This perfectly matches the semantics of
+// the WebAssembly i32x4.dot_i16x8_s instruction.
+//
+// Step 1. Widened arithmetic operation (8 lanes)
+// We match the following variations for 't' (v8i32):
+// - Var * Var: t = mul (sign_extend (v8i16 x0)), (sign_extend (v8i16 x1))
+// - Var * Const: t = mul (sign_extend (v8i16 x0)), (v8i32 cst)
+// - Var << Const: t = shl (sign_extend (v8i16 x0)), (v8i32 cst)
+//
+// Step 2. Pairwise addition (v8i32 -> v4i32)
+// The Type Legalizer shatters 't' into build_vectors of extract_elts.
+// Even: v4i32 = build_vector (extract_elt t, 0), (extract_elt t, 2),
+// (extract_elt t, 4), (extract_elt t, 6)
+// Odd: v4i32 = build_vector (extract_elt t, 1), (extract_elt t, 3),
+// (extract_elt t, 5), (extract_elt t, 7)
+// Result: v4i32 = add Even, Odd
+//
+// Step 3. Combine into DOT
+// This entire DAG is folded into a single WebAssemblyISD::DOT node:
+// - For Var * Var: v4i32 = DOT (v8i16 x0), (v8i16 x1)
+// - For Var * Const and Var << Const: the constants are converted into a new
+//   v8i16 vector, folding into DOT (v8i16 x0), (v8i16 new_cst)
+//
+// Mathematical Equivalent
+// This optimization ensures the following pairwise dot product is computed:
+// Result[i] = (x0[2*i] * x1[2*i]) + (x0[2*i + 1] * x1[2*i + 1])
+// (for i = 0, 1, 2, 3)
static SDValue combineToDOT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
using namespace SDPatternMatch;
SelectionDAG &DAG = DCI.DAG;
>From 20a5b2ebd45c51177236d3a7bbde1e28d88f04a5 Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Thu, 26 Feb 2026 14:31:57 +0900
Subject: [PATCH 4/6] fix wrong comparing of vector index
---
.../WebAssembly/WebAssemblyISelLowering.cpp | 4 +--
.../WebAssembly/simd-dot-reductions.ll | 25 +++++++++++++++++++
2 files changed, 26 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index b8f0d3fc538b7..01627caa5aea1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3778,9 +3778,7 @@ static SDValue combineToDOT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
if (BaseVec != VecL || BaseVec != VecR)
return SDValue();
- if (IdxL.getZExtValue() > IdxR.getZExtValue())
- std::swap(IdxL, IdxR);
- if (!(IdxL == I * 2 && IdxR == I * 2 + 1))
+ if (IdxL.getZExtValue() != I * 2 || IdxR.getZExtValue() != I * 2 + 1)
return SDValue();
}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
index efb37db738628..9e0eccecf31dd 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
@@ -248,3 +248,28 @@ define <4 x i32> @combine_with_shl_overflow(<8 x i16> %v) {
%4 = add <4 x i32> %2, %3
ret <4 x i32> %4
}
+
+; Negative - vector index is not composed of V[2*i] and V[2*i+1].
+define void @load_sext_both(ptr %a, ptr %b, ptr %s) {
+; CHECK-LABEL: load_sext_both:
+; CHECK: .functype load_sext_both (i32, i32, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load 0
+; CHECK-NEXT: i32x4.dot_i16x8_s
+; CHECK-NEXT: v128.store 0
+; CHECK-NEXT: # fallthrough-return
+ %load = load <8 x i16>, ptr %a, align 16
+ %1 = sext <8 x i16> %load to <8 x i32>
+ %2 = load <8 x i16>, ptr %b, align 16
+ %3 = sext <8 x i16> %2 to <8 x i32>
+ %4 = mul nsw <8 x i32> %3, %1
+ %5 = shufflevector <8 x i32> %4, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %6 = shufflevector <8 x i32> %4, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %7 = add <4 x i32> %5, %6
+ store <4 x i32> %7, ptr %s, align 16
+ ret void
+}
>From c8d9f0d30cb1413dc8b0275190ae4dff0da3b163 Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Thu, 26 Feb 2026 14:32:33 +0900
Subject: [PATCH 5/6] fix wrong written testcase comment
---
llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
index 9e0eccecf31dd..ce8b6e0d54a2a 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
@@ -219,7 +219,7 @@ define <4 x i32> @combine_with_shl(<8 x i16> %v) {
ret <4 x i32> %4
}
-; Negative - shifts by 16 overflow, so for x86 at least we don't optimize this.
+; Negative - shifts by 16 overflow, so we don't optimize this.
define <4 x i32> @combine_with_shl_overflow(<8 x i16> %v) {
; CHECK-LABEL: combine_with_shl_overflow:
; CHECK: .functype combine_with_shl_overflow (v128) -> (v128)
>From f7073d1df908a86eb264d7aba757485d2c5b2a9c Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Thu, 26 Feb 2026 14:34:55 +0900
Subject: [PATCH 6/6] fix formatting
---
llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 01627caa5aea1..3579da47c32f9 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3778,7 +3778,7 @@ static SDValue combineToDOT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
if (BaseVec != VecL || BaseVec != VecR)
return SDValue();
- if (IdxL.getZExtValue() != I * 2 || IdxR.getZExtValue() != I * 2 + 1)
+ if (IdxL.getZExtValue() != I * 2 || IdxR.getZExtValue() != I * 2 + 1)
return SDValue();
}
More information about the llvm-commits
mailing list