[llvm] [WebAssembly] Fold extended vector shifts by constant to extmul (PR #184007)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 1 06:13:57 PST 2026
https://github.com/ParkHanbum created https://github.com/llvm/llvm-project/pull/184007
Vector shifts of extended operands by a constant vector were lowered
into independent extend and shift nodes.
Example: `shl (WebAssemblyISD::EXTEND_LOW_S t1), <12, 0, 12, 0>`
WebAssembly SIMD lacks extended shifts but supports extended
multiplications. Converting the shift constant into a multiplier and
wrapping it in an extend node normalizes the DAG for extmul selection.
The selector matches the mul(ext, ext) structure into extmul, using
explicit undef padding to fulfill the 128-bit register constraint.
Fixed: https://github.com/llvm/llvm-project/issues/179143
From 7d01dfa96bcf7d935772da0cb4aaa58c0d207541 Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Sun, 1 Mar 2026 18:26:51 +0900
Subject: [PATCH 1/2] add testcase for upcoming patch
---
.../test/CodeGen/WebAssembly/wide-simd-mul.ll | 76 +++++++++++++++++++
1 file changed, 76 insertions(+)
diff --git a/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll b/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll
index 94aa197bfd564..10dc175a9e579 100644
--- a/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll
+++ b/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll
@@ -195,3 +195,79 @@ define <8 x i32> @zext_sext_mul_v8i16(<8 x i16> %a, <8 x i16> %b) {
%mul = mul <8 x i32> %wide.a, %wide.b
ret <8 x i32> %mul
}
+
+define <4 x i32> @sext_mul_v8i16_with_symmetric_constant_vector(<8 x i16> %v) {
+; CHECK-LABEL: sext_mul_v8i16_with_symmetric_constant_vector:
+; CHECK: .functype sext_mul_v8i16_with_symmetric_constant_vector (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32x4.extend_low_i16x8_s $push24=, $0
+; CHECK-NEXT: local.tee $push23=, $1=, $pop24
+; CHECK-NEXT: i32x4.extract_lane $push8=, $1, 0
+; CHECK-NEXT: i32.const $push1=, 12
+; CHECK-NEXT: i32.shl $push9=, $pop8, $pop1
+; CHECK-NEXT: i32x4.replace_lane $push10=, $pop23, 0, $pop9
+; CHECK-NEXT: i32x4.extract_lane $push6=, $1, 2
+; CHECK-NEXT: i32.const $push22=, 12
+; CHECK-NEXT: i32.shl $push7=, $pop6, $pop22
+; CHECK-NEXT: i32x4.replace_lane $push21=, $pop10, 2, $pop7
+; CHECK-NEXT: local.tee $push20=, $1=, $pop21
+; CHECK-NEXT: i32x4.extend_high_i16x8_s $push19=, $0
+; CHECK-NEXT: local.tee $push18=, $0=, $pop19
+; CHECK-NEXT: i32x4.extract_lane $push3=, $0, 0
+; CHECK-NEXT: i32.const $push17=, 12
+; CHECK-NEXT: i32.shl $push4=, $pop3, $pop17
+; CHECK-NEXT: i32x4.replace_lane $push5=, $pop18, 0, $pop4
+; CHECK-NEXT: i32x4.extract_lane $push0=, $0, 2
+; CHECK-NEXT: i32.const $push16=, 12
+; CHECK-NEXT: i32.shl $push2=, $pop0, $pop16
+; CHECK-NEXT: i32x4.replace_lane $push15=, $pop5, 2, $pop2
+; CHECK-NEXT: local.tee $push14=, $0=, $pop15
+; CHECK-NEXT: i8x16.shuffle $push12=, $pop20, $pop14, 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: i8x16.shuffle $push11=, $1, $0, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: i32x4.add $push13=, $pop12, $pop11
+; CHECK-NEXT: return $pop13
+ %sext = sext <8 x i16> %v to <8 x i32>
+ %1 = mul nsw <8 x i32> %sext, <i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add <4 x i32> %2, %3
+ ret <4 x i32> %4
+}
+
+define <4 x i32> @sext_mul_v8i16_with_constant(<8 x i16> %v) {
+; CHECK-LABEL: sext_mul_v8i16_with_constant:
+; CHECK: .functype sext_mul_v8i16_with_constant (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32x4.extend_low_i16x8_s $push24=, $0
+; CHECK-NEXT: local.tee $push23=, $1=, $pop24
+; CHECK-NEXT: i32x4.extract_lane $push8=, $1, 0
+; CHECK-NEXT: i32.const $push1=, 12
+; CHECK-NEXT: i32.shl $push9=, $pop8, $pop1
+; CHECK-NEXT: i32x4.replace_lane $push10=, $pop23, 0, $pop9
+; CHECK-NEXT: i32x4.extract_lane $push6=, $1, 2
+; CHECK-NEXT: i32.const $push22=, 12
+; CHECK-NEXT: i32.shl $push7=, $pop6, $pop22
+; CHECK-NEXT: i32x4.replace_lane $push21=, $pop10, 2, $pop7
+; CHECK-NEXT: local.tee $push20=, $1=, $pop21
+; CHECK-NEXT: i32x4.extend_high_i16x8_s $push19=, $0
+; CHECK-NEXT: local.tee $push18=, $0=, $pop19
+; CHECK-NEXT: i32x4.extract_lane $push3=, $0, 0
+; CHECK-NEXT: i32.const $push17=, 12
+; CHECK-NEXT: i32.shl $push4=, $pop3, $pop17
+; CHECK-NEXT: i32x4.replace_lane $push5=, $pop18, 0, $pop4
+; CHECK-NEXT: i32x4.extract_lane $push0=, $0, 2
+; CHECK-NEXT: i32.const $push16=, 12
+; CHECK-NEXT: i32.shl $push2=, $pop0, $pop16
+; CHECK-NEXT: i32x4.replace_lane $push15=, $pop5, 2, $pop2
+; CHECK-NEXT: local.tee $push14=, $0=, $pop15
+; CHECK-NEXT: i8x16.shuffle $push12=, $pop20, $pop14, 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: i8x16.shuffle $push11=, $1, $0, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: i32x4.add $push13=, $pop12, $pop11
+; CHECK-NEXT: return $pop13
+ %sext = sext <8 x i16> %v to <8 x i32>
+ %1 = mul nsw <8 x i32> %sext, <i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add <4 x i32> %2, %3
+ ret <4 x i32> %4
+}
From 4e7ce19ed78e71a5e1fab06bec2113f089569fdc Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Sun, 1 Mar 2026 23:06:40 +0900
Subject: [PATCH 2/2] [WebAssembly] Fold extended vector shifts by constant to
extmul
Vector shifts of extended operands by a constant vector were lowered
into independent extend and shift nodes.
Example: `shl (WebAssemblyISD::EXTEND_LOW_S t1), <12, 0, 12, 0>`
WebAssembly SIMD lacks extended shifts but supports extended
multiplications. Converting the shift constant into a multiplier and
wrapping it in an extend node normalizes the DAG for extmul selection.
The selector matches the mul(ext, ext) structure into extmul, using
explicit undef padding to fulfill the 128-bit register constraint.
Fixed: #179143
---
.../WebAssembly/WebAssemblyISelLowering.cpp | 73 +++++++++++++++++++
.../test/CodeGen/WebAssembly/wide-simd-mul.ll | 72 +++++-------------
2 files changed, 93 insertions(+), 52 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index faea931aeccdc..39e33cc76a5ba 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -2797,9 +2797,82 @@ static SDValue unrollVectorShift(SDValue Op, SelectionDAG &DAG) {
return DAG.getBuildVector(Op.getValueType(), DL, UnrolledOps);
}
+/// Convert a vector shift of an extended value into a multiplication of
+/// extended values. By converting the shift amount to a multiplier (1 << C)
+/// and wrapping it in a matching extend node, we enable the instruction
+/// selector to match the pattern to WebAssembly extended multiplication
+/// instructions (e.g., i32x4.extmul_low_i16x8_s). Inactive lanes in the
+/// multiplier vector are populated with undefs.
+///
+/// Example transformation:
+/// Before:
+/// t1: v8i16 = ...
+/// t2: v4i32 = WebAssemblyISD::EXTEND_LOW_S t1
+/// t3: v4i32 = BUILD_VECTOR Constant:i32<12>, Constant:i32<0>, ...
+/// t4: v4i32 = shl t2, t3
+///
+/// After:
+/// t1: v8i16 = ...
+/// t2: v4i32 = WebAssemblyISD::EXTEND_LOW_S t1
+/// t3: v8i16 = BUILD_VECTOR Constant:i16<4096>, Constant:i16<1>, ..., undef, undef
+/// t4: v4i32 = WebAssemblyISD::EXTEND_LOW_S t3
+/// t5: v4i32 = mul t2, t4
+static SDValue foldShiftByConstantToExtMul(SDValue Op, SelectionDAG &DAG) {
+ if (Op.getOpcode() != ISD::SHL || !Op.getValueType().isVector())
+ return SDValue();
+
+ SDValue RHS = Op.getOperand(1);
+ if (RHS.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ for (SDValue LaneOp : RHS->ops()) {
+ if (!isa<ConstantSDNode>(LaneOp))
+ return SDValue();
+ }
+
+ SDLoc DL(Op);
+ SDValue LHS = Op.getOperand(0);
+ unsigned ExtOpc = LHS.getOpcode();
+ bool IsLow = false;
+ if (ExtOpc == WebAssemblyISD::EXTEND_LOW_S ||
+ ExtOpc == WebAssemblyISD::EXTEND_HIGH_S) {
+ IsLow = (ExtOpc == WebAssemblyISD::EXTEND_LOW_S);
+ } else if (ExtOpc == WebAssemblyISD::EXTEND_LOW_U ||
+ ExtOpc == WebAssemblyISD::EXTEND_HIGH_U) {
+ IsLow = (ExtOpc == WebAssemblyISD::EXTEND_LOW_U);
+ } else {
+ return SDValue();
+ }
+
+ SDValue SrcVec = LHS.getOperand(0);
+ EVT SrcVecTy = SrcVec.getValueType();
+ unsigned SrcVecEltNum = SrcVecTy.getVectorNumElements();
+ unsigned ConstVecEltNum = SrcVecEltNum / 2;
+ SmallVector<SDValue, 16> MulConsts(SrcVecEltNum,
+ DAG.getUNDEF(SrcVecTy.getScalarType()));
+ unsigned StartIdx = IsLow ? 0 : ConstVecEltNum;
+ for (unsigned I = 0; I < ConstVecEltNum; ++I) {
+ auto *C = cast<ConstantSDNode>(RHS.getOperand(I));
+ uint64_t ShiftAmt = C->getZExtValue();
+ if (ShiftAmt >= SrcVecTy.getScalarSizeInBits())
+ return SDValue();
+
+ uint64_t MulAmt = 1ULL << ShiftAmt;
+ MulConsts[StartIdx + I] =
+ DAG.getConstant(MulAmt, DL, SrcVecTy.getScalarType());
+ }
+
+ SDValue ConstVec = DAG.getBuildVector(SrcVecTy, DL, MulConsts);
+ SDValue ExtConstVec = DAG.getNode(ExtOpc, DL, Op.getValueType(), ConstVec);
+
+ return DAG.getNode(ISD::MUL, DL, Op.getValueType(), LHS, ExtConstVec);
+}
+
SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
+ if (SDValue FoldedExtMul = foldShiftByConstantToExtMul(Op, DAG))
+ return FoldedExtMul;
// Only manually lower vector shifts
assert(Op.getSimpleValueType().isVector());
diff --git a/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll b/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll
index 10dc175a9e579..28722163d367c 100644
--- a/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll
+++ b/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll
@@ -200,32 +200,16 @@ define <4 x i32> @sext_mul_v8i16_with_symmetric_constant_vector(<8 x i16> %v) {
; CHECK-LABEL: sext_mul_v8i16_with_symmetric_constant_vector:
; CHECK: .functype sext_mul_v8i16_with_symmetric_constant_vector (v128) -> (v128)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: i32x4.extend_low_i16x8_s $push24=, $0
-; CHECK-NEXT: local.tee $push23=, $1=, $pop24
-; CHECK-NEXT: i32x4.extract_lane $push8=, $1, 0
-; CHECK-NEXT: i32.const $push1=, 12
-; CHECK-NEXT: i32.shl $push9=, $pop8, $pop1
-; CHECK-NEXT: i32x4.replace_lane $push10=, $pop23, 0, $pop9
-; CHECK-NEXT: i32x4.extract_lane $push6=, $1, 2
-; CHECK-NEXT: i32.const $push22=, 12
-; CHECK-NEXT: i32.shl $push7=, $pop6, $pop22
-; CHECK-NEXT: i32x4.replace_lane $push21=, $pop10, 2, $pop7
-; CHECK-NEXT: local.tee $push20=, $1=, $pop21
-; CHECK-NEXT: i32x4.extend_high_i16x8_s $push19=, $0
-; CHECK-NEXT: local.tee $push18=, $0=, $pop19
-; CHECK-NEXT: i32x4.extract_lane $push3=, $0, 0
-; CHECK-NEXT: i32.const $push17=, 12
-; CHECK-NEXT: i32.shl $push4=, $pop3, $pop17
-; CHECK-NEXT: i32x4.replace_lane $push5=, $pop18, 0, $pop4
-; CHECK-NEXT: i32x4.extract_lane $push0=, $0, 2
-; CHECK-NEXT: i32.const $push16=, 12
-; CHECK-NEXT: i32.shl $push2=, $pop0, $pop16
-; CHECK-NEXT: i32x4.replace_lane $push15=, $pop5, 2, $pop2
-; CHECK-NEXT: local.tee $push14=, $0=, $pop15
-; CHECK-NEXT: i8x16.shuffle $push12=, $pop20, $pop14, 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK-NEXT: i8x16.shuffle $push11=, $1, $0, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK-NEXT: i32x4.add $push13=, $pop12, $pop11
-; CHECK-NEXT: return $pop13
+; CHECK-NEXT: v128.const $push1=, 4096, 1, 4096, 1, 0, 0, 0, 0
+; CHECK-NEXT: i32x4.extmul_low_i16x8_s $push8=, $0, $pop1
+; CHECK-NEXT: local.tee $push7=, $1=, $pop8
+; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0, 4096, 1, 4096, 1
+; CHECK-NEXT: i32x4.extmul_high_i16x8_s $push6=, $0, $pop0
+; CHECK-NEXT: local.tee $push5=, $0=, $pop6
+; CHECK-NEXT: i8x16.shuffle $push3=, $pop7, $pop5, 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: i8x16.shuffle $push2=, $1, $0, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: i32x4.add $push4=, $pop3, $pop2
+; CHECK-NEXT: return $pop4
%sext = sext <8 x i16> %v to <8 x i32>
%1 = mul nsw <8 x i32> %sext, <i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1>
%2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -238,32 +222,16 @@ define <4 x i32> @sext_mul_v8i16_with_constant(<8 x i16> %v) {
; CHECK-LABEL: sext_mul_v8i16_with_constant:
; CHECK: .functype sext_mul_v8i16_with_constant (v128) -> (v128)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: i32x4.extend_low_i16x8_s $push24=, $0
-; CHECK-NEXT: local.tee $push23=, $1=, $pop24
-; CHECK-NEXT: i32x4.extract_lane $push8=, $1, 0
-; CHECK-NEXT: i32.const $push1=, 12
-; CHECK-NEXT: i32.shl $push9=, $pop8, $pop1
-; CHECK-NEXT: i32x4.replace_lane $push10=, $pop23, 0, $pop9
-; CHECK-NEXT: i32x4.extract_lane $push6=, $1, 2
-; CHECK-NEXT: i32.const $push22=, 12
-; CHECK-NEXT: i32.shl $push7=, $pop6, $pop22
-; CHECK-NEXT: i32x4.replace_lane $push21=, $pop10, 2, $pop7
-; CHECK-NEXT: local.tee $push20=, $1=, $pop21
-; CHECK-NEXT: i32x4.extend_high_i16x8_s $push19=, $0
-; CHECK-NEXT: local.tee $push18=, $0=, $pop19
-; CHECK-NEXT: i32x4.extract_lane $push3=, $0, 0
-; CHECK-NEXT: i32.const $push17=, 12
-; CHECK-NEXT: i32.shl $push4=, $pop3, $pop17
-; CHECK-NEXT: i32x4.replace_lane $push5=, $pop18, 0, $pop4
-; CHECK-NEXT: i32x4.extract_lane $push0=, $0, 2
-; CHECK-NEXT: i32.const $push16=, 12
-; CHECK-NEXT: i32.shl $push2=, $pop0, $pop16
-; CHECK-NEXT: i32x4.replace_lane $push15=, $pop5, 2, $pop2
-; CHECK-NEXT: local.tee $push14=, $0=, $pop15
-; CHECK-NEXT: i8x16.shuffle $push12=, $pop20, $pop14, 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK-NEXT: i8x16.shuffle $push11=, $1, $0, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK-NEXT: i32x4.add $push13=, $pop12, $pop11
-; CHECK-NEXT: return $pop13
+; CHECK-NEXT: v128.const $push1=, 4096, 1, 4096, 1, 0, 0, 0, 0
+; CHECK-NEXT: i32x4.extmul_low_i16x8_s $push8=, $0, $pop1
+; CHECK-NEXT: local.tee $push7=, $1=, $pop8
+; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0, 4096, 1, 4096, 1
+; CHECK-NEXT: i32x4.extmul_high_i16x8_s $push6=, $0, $pop0
+; CHECK-NEXT: local.tee $push5=, $0=, $pop6
+; CHECK-NEXT: i8x16.shuffle $push3=, $pop7, $pop5, 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: i8x16.shuffle $push2=, $1, $0, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: i32x4.add $push4=, $pop3, $pop2
+; CHECK-NEXT: return $pop4
%sext = sext <8 x i16> %v to <8 x i32>
%1 = mul nsw <8 x i32> %sext, <i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1>
%2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
More information about the llvm-commits
mailing list