[llvm] [WebAssembly] Optimize legalized vector multiplications into i32x4.dot_i16x8_s (PR #183244)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 25 21:35:10 PST 2026
https://github.com/ParkHanbum updated https://github.com/llvm/llvm-project/pull/183244
>From c6fbd779f5380a2621281d9e9777b8f99c8e148f Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Wed, 25 Feb 2026 14:44:37 +0900
Subject: [PATCH 1/6] add testcases for upcoming patch
---
.../WebAssembly/simd-dot-reductions.ll | 175 +++++++++++++++++-
1 file changed, 172 insertions(+), 3 deletions(-)
diff --git a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
index d9e5aba6f9b94..6cdccf5df1e4c 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
@@ -1,7 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mattr=+simd128 | FileCheck %s
-
-target triple = "wasm32-unknown-unknown"
+; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mattr=+simd128 | FileCheck %s
define <4 x i32> @dot_sext_1(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: dot_sext_1:
@@ -141,3 +139,174 @@ start:
%4 = add nsw <4 x i32> %2, %3
ret <4 x i32> %4
}
+
+define void @load_sext_both(ptr %a, ptr %b, ptr %s) {
+; CHECK-LABEL: load_sext_both:
+; CHECK: .functype load_sext_both (i32, i32, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load 0
+; CHECK-NEXT: i32x4.dot_i16x8_s
+; CHECK-NEXT: v128.store 0
+; CHECK-NEXT: # fallthrough-return
+ %load = load <8 x i16>, ptr %a, align 16
+ %1 = sext <8 x i16> %load to <8 x i32>
+ %2 = load <8 x i16>, ptr %b, align 16
+ %3 = sext <8 x i16> %2 to <8 x i32>
+ %4 = mul nsw <8 x i32> %3, %1
+ %5 = shufflevector <8 x i32> %4, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %6 = shufflevector <8 x i32> %4, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %7 = add <4 x i32> %5, %6
+ store <4 x i32> %7, ptr %s, align 16
+ ret void
+}
+
+define void @add_from_same_base(ptr %a, ptr %s) {
+; CHECK-LABEL: add_from_same_base:
+; CHECK: .functype add_from_same_base (i32, i32) -> ()
+; CHECK-NEXT: .local v128
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load 0
+; CHECK-NEXT: local.tee 2
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i32x4.dot_i16x8_s
+; CHECK-NEXT: v128.store 0
+; CHECK-NEXT: # fallthrough-return
+ %load = load <8 x i16>, ptr %a, align 16
+ %1 = sext <8 x i16> %load to <8 x i32>
+ %mul = mul nsw <8 x i32> %1, %1
+ %2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add nsw <4 x i32> %2, %3
+ store <4 x i32> %4, ptr %s, align 16
+ ret void
+}
+
+define <4 x i32> @combine_with_constant(<8 x i16> %v) {
+; CHECK-LABEL: combine_with_constant:
+; CHECK: .functype combine_with_constant (v128) -> (v128)
+; CHECK-NEXT: .local v128
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extend_low_i16x8_s
+; CHECK-NEXT: local.tee 1
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extract_lane 0
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32x4.replace_lane 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extract_lane 2
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32x4.replace_lane 2
+; CHECK-NEXT: local.tee 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extend_high_i16x8_s
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extract_lane 0
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32x4.replace_lane 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extract_lane 2
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32x4.replace_lane 2
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: i32x4.add
+; CHECK-NEXT: # fallthrough-return
+ %sext = sext <8 x i16> %v to <8 x i32>
+ %1 = mul nsw <8 x i32> %sext, <i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add <4 x i32> %2, %3
+ ret <4 x i32> %4
+}
+
+define <4 x i32> @combine_with_shl(<8 x i16> %v) {
+; CHECK-LABEL: combine_with_shl:
+; CHECK: .functype combine_with_shl (v128) -> (v128)
+; CHECK-NEXT: .local v128
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extend_low_i16x8_s
+; CHECK-NEXT: local.tee 1
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extract_lane 0
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32x4.replace_lane 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extract_lane 2
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32x4.replace_lane 2
+; CHECK-NEXT: local.tee 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extend_high_i16x8_s
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extract_lane 0
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32x4.replace_lane 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extract_lane 2
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32x4.replace_lane 2
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: i32x4.add
+; CHECK-NEXT: # fallthrough-return
+ %sext = sext <8 x i16> %v to <8 x i32>
+ %1 = shl nsw <8 x i32> %sext, <i32 12, i32 0, i32 12, i32 0, i32 12, i32 0, i32 12, i32 0>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add <4 x i32> %2, %3
+ ret <4 x i32> %4
+}
+
+; Negative - shifts by 16 overflow, so for x86 at least we don't optimize this.
+define <4 x i32> @combine_with_shl_overflow(<8 x i16> %v) {
+; CHECK-LABEL: combine_with_shl_overflow:
+; CHECK: .functype combine_with_shl_overflow (v128) -> (v128)
+; CHECK-NEXT: .local v128
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extend_low_i16x8_u
+; CHECK-NEXT: i32.const 16
+; CHECK-NEXT: i32x4.shl
+; CHECK-NEXT: local.tee 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.extend_high_i16x8_u
+; CHECK-NEXT: i32.const 16
+; CHECK-NEXT: i32x4.shl
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: i32x4.add
+; CHECK-NEXT: # fallthrough-return
+ %sext = sext <8 x i16> %v to <8 x i32>
+ %1 = shl nsw <8 x i32> %sext, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add <4 x i32> %2, %3
+ ret <4 x i32> %4
+}
>From 1625e83f61ee0e2ef82b530ee292e105fa4c1b5a Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Wed, 25 Feb 2026 14:47:33 +0900
Subject: [PATCH 2/6] [WebAssembly] Optimize legalized vector multiplications
into i32x4.dot_i16x8_s
When performing multiplication and sign-extension from <8 x i16> to <8 x i32>,
Type Legalization shatters the DAG into a pair of BUILD_VECTORs fed by
EXTRACT_VECTOR_ELTs, resulting in inefficient instruction emission.
This patch recognizes the legalized ADD(BUILD_VECTOR, BUILD_VECTOR) pattern and
combines it into the dedicated WebAssembly SIMD instruction i32x4.dot_i16x8_s.
Implemented combineToDOT helper in performADDCombine to emit a single
i32x4.dot_i16x8_s instruction.
Fixes: #179143
---
.../WebAssembly/WebAssemblyISelLowering.cpp | 109 ++++++++++++++++++
.../WebAssembly/simd-dot-reductions.ll | 70 +----------
2 files changed, 113 insertions(+), 66 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index faea931aeccdc..667830946b8c1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -224,6 +224,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
ISD::CONCAT_VECTORS});
setTargetDAGCombine(ISD::TRUNCATE);
+ setTargetDAGCombine(ISD::ADD);
// Support saturating add/sub for i8x16 and i16x8
for (auto Op : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
@@ -3714,6 +3715,112 @@ SDValue performConvertFPCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+// Considering the following pattern
+// t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
+// (add
+// (build_vector (extract_elt t, 0),
+// (extract_elt t, 2),
+// (extract_elt t, 4),
+// (extract_elt t, 6)),
+// (build_vector (extract_elt t, 1),
+// (extract_elt t, 3),
+// (extract_elt t, 5),
+// (extract_elt t, 7)))
+// will combine into
+// (v4i32 dot (x0, x1))
+static SDValue combineToDOT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ using namespace SDPatternMatch;
+ SelectionDAG &DAG = DCI.DAG;
+ if (!DAG.getSubtarget<WebAssemblySubtarget>().hasSIMD128())
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ if (LHS.getOpcode() != ISD::BUILD_VECTOR ||
+ RHS.getOpcode() != ISD::BUILD_VECTOR || LHS.getNumOperands() != 4 ||
+ RHS.getNumOperands() != 4)
+ return SDValue();
+
+ SDValue BaseVec;
+ for (unsigned I = 0; I < 4; ++I) {
+ SDValue OpL = LHS.getOperand(I);
+ SDValue OpR = RHS.getOperand(I);
+ SDValue VecL, VecR;
+ APInt IdxL, IdxR;
+ if (!sd_match(OpL, m_ExtractElt(m_Value(VecL), m_ConstInt(IdxL))) ||
+ !sd_match(OpR, m_ExtractElt(m_Value(VecR), m_ConstInt(IdxR))))
+ return SDValue();
+
+ if (!BaseVec) {
+ BaseVec = VecL;
+ unsigned Opc = BaseVec.getOpcode();
+ if ((Opc != ISD::MUL && Opc != ISD::SHL) ||
+ BaseVec.getValueType().getVectorNumElements() != 8)
+ return SDValue();
+ }
+
+ if (BaseVec != VecL || BaseVec != VecR)
+ return SDValue();
+
+ if (IdxL.getZExtValue() > IdxR.getZExtValue())
+ std::swap(IdxL, IdxR);
+ if (!(IdxL == I * 2 && IdxR == I * 2 + 1))
+ return SDValue();
+ }
+
+ SDValue DotLHS, DotRHS;
+ SDValue BaseLHS, BaseRHS;
+ unsigned BaseOpc = BaseVec.getOpcode();
+ if (BaseOpc == ISD::MUL &&
+ sd_match(BaseVec, m_Mul(m_UnaryOp(ISD::SIGN_EXTEND, m_Value(BaseLHS)),
+ m_UnaryOp(ISD::SIGN_EXTEND, m_Value(BaseRHS))))) {
+ DotLHS = BaseLHS;
+ DotRHS = BaseRHS;
+ } else if (sd_match(BaseVec,
+ m_Mul(m_UnaryOp(ISD::SIGN_EXTEND, m_Value(BaseLHS)),
+ m_Value(BaseRHS))) ||
+ sd_match(BaseVec,
+ m_Shl(m_UnaryOp(ISD::SIGN_EXTEND, m_Value(BaseLHS)),
+ m_Value(BaseRHS)))) {
+ // (mul | shl) (sext(v8i16 LHS), v8i16 <C, ...>)
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(BaseRHS)) {
+ SmallVector<SDValue, 8> DotMultipliers;
+ for (unsigned I = 0; I < 8; ++I) {
+ auto *C = dyn_cast<ConstantSDNode>(BV->getOperand(I));
+ if (!C)
+ return SDValue();
+
+ uint64_t Val = C->getZExtValue();
+ if (BaseOpc == ISD::SHL)
+ Val = 1ULL << Val;
+ // A large multiplier (e.g. a shift by 15 or more) would overflow a
+ // signed i16.
+ if (!isInt<16>(Val))
+ return SDValue();
+
+ DotMultipliers.push_back(DAG.getConstant(Val, DL, MVT::i16));
+ }
+
+ DotLHS = BaseLHS;
+ DotRHS = DAG.getBuildVector(MVT::v8i16, DL, DotMultipliers);
+ }
+ }
+
+ if (!DotLHS || !DotRHS)
+ return SDValue();
+
+ return DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, DotLHS, DotRHS);
+}
+
+static SDValue performADDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (SDValue DotCombine = combineToDOT(N, DCI))
+ return DotCombine;
+
+ return SDValue();
+}
+
SDValue
WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -3749,5 +3856,7 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
return performAnyAllCombine(N, DCI.DAG);
case ISD::MUL:
return performMulCombine(N, DCI);
+ case ISD::ADD:
+ return performADDCombine(N, DCI);
}
}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
index 6cdccf5df1e4c..efb37db738628 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
@@ -190,41 +190,10 @@ define void @add_from_same_base(ptr %a, ptr %s) {
define <4 x i32> @combine_with_constant(<8 x i16> %v) {
; CHECK-LABEL: combine_with_constant:
; CHECK: .functype combine_with_constant (v128) -> (v128)
-; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32x4.extend_low_i16x8_s
-; CHECK-NEXT: local.tee 1
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: i32x4.extract_lane 0
-; CHECK-NEXT: i32.const 12
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32x4.replace_lane 0
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: i32x4.extract_lane 2
-; CHECK-NEXT: i32.const 12
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32x4.replace_lane 2
-; CHECK-NEXT: local.tee 1
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32x4.extend_high_i16x8_s
-; CHECK-NEXT: local.tee 0
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32x4.extract_lane 0
-; CHECK-NEXT: i32.const 12
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32x4.replace_lane 0
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32x4.extract_lane 2
-; CHECK-NEXT: i32.const 12
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32x4.replace_lane 2
-; CHECK-NEXT: local.tee 0
-; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK-NEXT: i32x4.add
+; CHECK-NEXT: v128.const 4096, 1, 4096, 1, 4096, 1, 4096, 1
+; CHECK-NEXT: i32x4.dot_i16x8_s
; CHECK-NEXT: # fallthrough-return
%sext = sext <8 x i16> %v to <8 x i32>
%1 = mul nsw <8 x i32> %sext, <i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1>
@@ -237,41 +206,10 @@ define <4 x i32> @combine_with_constant(<8 x i16> %v) {
define <4 x i32> @combine_with_shl(<8 x i16> %v) {
; CHECK-LABEL: combine_with_shl:
; CHECK: .functype combine_with_shl (v128) -> (v128)
-; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32x4.extend_low_i16x8_s
-; CHECK-NEXT: local.tee 1
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: i32x4.extract_lane 0
-; CHECK-NEXT: i32.const 12
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32x4.replace_lane 0
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: i32x4.extract_lane 2
-; CHECK-NEXT: i32.const 12
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32x4.replace_lane 2
-; CHECK-NEXT: local.tee 1
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32x4.extend_high_i16x8_s
-; CHECK-NEXT: local.tee 0
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32x4.extract_lane 0
-; CHECK-NEXT: i32.const 12
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32x4.replace_lane 0
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32x4.extract_lane 2
-; CHECK-NEXT: i32.const 12
-; CHECK-NEXT: i32.shl
-; CHECK-NEXT: i32x4.replace_lane 2
-; CHECK-NEXT: local.tee 0
-; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK-NEXT: i32x4.add
+; CHECK-NEXT: v128.const 4096, 1, 4096, 1, 4096, 1, 4096, 1
+; CHECK-NEXT: i32x4.dot_i16x8_s
; CHECK-NEXT: # fallthrough-return
%sext = sext <8 x i16> %v to <8 x i32>
%1 = shl nsw <8 x i32> %sext, <i32 12, i32 0, i32 12, i32 0, i32 12, i32 0, i32 12, i32 0>
>From 56a0750c7683f378fce7c264be76713c2b1603ca Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Wed, 25 Feb 2026 18:30:18 +0900
Subject: [PATCH 3/6] function comment has been revised to fully capture the
function's purpose
---
.../WebAssembly/WebAssemblyISelLowering.cpp | 41 +++++++++++++------
1 file changed, 28 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 667830946b8c1..b8f0d3fc538b7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3715,19 +3715,34 @@ SDValue performConvertFPCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// Considering the following pattern
-// t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
-// (add
-// (build_vector (extract_elt t, 0),
-// (extract_elt t, 2),
-// (extract_elt t, 4),
-// (extract_elt t, 6)),
-// (build_vector (extract_elt t, 1),
-// (extract_elt t, 3),
-// (extract_elt t, 5),
-// (extract_elt t, 7)))
-// will combine into
-// (v4i32 dot (x0, x1))
+// We are looking for patterns that represent a widened arithmetic operation
+// followed by a pairwise addition. This perfectly matches the semantics of
+// the WebAssembly i32x4.dot_i16x8_s instruction.
+//
+// Step 1. Widened arithmetic operation (8 lanes)
+// We match the following variations for 't' (v8i32):
+// - Var * Var: t = mul (sign_extend (v8i16 x0)), (sign_extend (v8i16 x1))
+// - Var * Const: t = mul (sign_extend (v8i16 x0)), (v8i32 cst)
+// - Var << Const: t = shl (sign_extend (v8i16 x0)), (v8i32 cst)
+//
+// Step 2. Pairwise addition (v8i32 -> v4i32)
+// The Type Legalizer shatters 't' into build_vectors of extract_elts.
+// Even: v4i32 = build_vector (extract_elt t, 0), (extract_elt t, 2),
+// (extract_elt t, 4), (extract_elt t, 6)
+// Odd: v4i32 = build_vector (extract_elt t, 1), (extract_elt t, 3),
+// (extract_elt t, 5), (extract_elt t, 7)
+// Result: v4i32 = add Even, Odd
+//
+// Step 3. Combine into DOT
+// This entire DAG is folded into a single WebAssemblyISD::DOT node:
+// - For Var * Var: v4i32 = DOT (v8i16 x0), (v8i16 x1)
+// - For Var * Const and Var << Const: the constants are converted into a new
+//   v8i16 vector, folding into DOT (v8i16 x0), (v8i16 new_cst)
+//
+// Mathematical Equivalent
+// This optimization ensures the following pairwise dot product is computed:
+// Result[i] = (x0[2*i] * x1[2*i]) + (x0[2*i + 1] * x1[2*i + 1])
+// (for i = 0, 1, 2, 3)
static SDValue combineToDOT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
using namespace SDPatternMatch;
SelectionDAG &DAG = DCI.DAG;
>From 20a5b2ebd45c51177236d3a7bbde1e28d88f04a5 Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Thu, 26 Feb 2026 14:31:57 +0900
Subject: [PATCH 4/6] fix wrong comparing of vector index
---
.../WebAssembly/WebAssemblyISelLowering.cpp | 4 +--
.../WebAssembly/simd-dot-reductions.ll | 25 +++++++++++++++++++
2 files changed, 26 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index b8f0d3fc538b7..01627caa5aea1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3778,9 +3778,7 @@ static SDValue combineToDOT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
if (BaseVec != VecL || BaseVec != VecR)
return SDValue();
- if (IdxL.getZExtValue() > IdxR.getZExtValue())
- std::swap(IdxL, IdxR);
- if (!(IdxL == I * 2 && IdxR == I * 2 + 1))
+ if (IdxL.getZExtValue() != I * 2 || IdxR.getZExtValue() != I * 2 + 1)
return SDValue();
}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
index efb37db738628..9e0eccecf31dd 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
@@ -248,3 +248,28 @@ define <4 x i32> @combine_with_shl_overflow(<8 x i16> %v) {
%4 = add <4 x i32> %2, %3
ret <4 x i32> %4
}
+
+; Negative - vector index is not composed of V[2*i] and V[2*i+1].
+define void @load_sext_both(ptr %a, ptr %b, ptr %s) {
+; CHECK-LABEL: load_sext_both:
+; CHECK: .functype load_sext_both (i32, i32, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: v128.load 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.load 0
+; CHECK-NEXT: i32x4.dot_i16x8_s
+; CHECK-NEXT: v128.store 0
+; CHECK-NEXT: # fallthrough-return
+ %load = load <8 x i16>, ptr %a, align 16
+ %1 = sext <8 x i16> %load to <8 x i32>
+ %2 = load <8 x i16>, ptr %b, align 16
+ %3 = sext <8 x i16> %2 to <8 x i32>
+ %4 = mul nsw <8 x i32> %3, %1
+ %5 = shufflevector <8 x i32> %4, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %6 = shufflevector <8 x i32> %4, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %7 = add <4 x i32> %5, %6
+ store <4 x i32> %7, ptr %s, align 16
+ ret void
+}
>From c8d9f0d30cb1413dc8b0275190ae4dff0da3b163 Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Thu, 26 Feb 2026 14:32:33 +0900
Subject: [PATCH 5/6] fix wrong written testcase comment
---
llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
index 9e0eccecf31dd..ce8b6e0d54a2a 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
@@ -219,7 +219,7 @@ define <4 x i32> @combine_with_shl(<8 x i16> %v) {
ret <4 x i32> %4
}
-; Negative - shifts by 16 overflow, so for x86 at least we don't optimize this.
+; Negative - shifts by 16 overflow, so we don't optimize this.
define <4 x i32> @combine_with_shl_overflow(<8 x i16> %v) {
; CHECK-LABEL: combine_with_shl_overflow:
; CHECK: .functype combine_with_shl_overflow (v128) -> (v128)
>From f7073d1df908a86eb264d7aba757485d2c5b2a9c Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Thu, 26 Feb 2026 14:34:55 +0900
Subject: [PATCH 6/6] fix formatting
---
llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 01627caa5aea1..3579da47c32f9 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3778,7 +3778,7 @@ static SDValue combineToDOT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
if (BaseVec != VecL || BaseVec != VecR)
return SDValue();
- if (IdxL.getZExtValue() != I * 2 || IdxR.getZExtValue() != I * 2 + 1)
+ if (IdxL.getZExtValue() != I * 2 || IdxR.getZExtValue() != I * 2 + 1)
return SDValue();
}
More information about the llvm-commits
mailing list