[llvm-branch-commits] [llvm] release/22.x: [WebAssembly] narrow instructions use signed saturation (#201798) (PR #201909)

Fri Jun 5 11:53:09 PDT 2026

llvmorg-github-actions[bot] wrote:




@llvm/pr-subscribers-backend-webassembly

Author: llvmbot

<details>
<summary>Changes</summary>

Backport c19fa5be5f47d8747f523509382b12997f2bd25f

Requested by: @nikic

---
Full diff: https://github.com/llvm/llvm-project/pull/201909.diff


3 Files Affected:

- (modified) llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td (+17-3) 
- (modified) llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll (+14-24) 
- (modified) llvm/test/CodeGen/WebAssembly/saturating-truncation.ll (+70-6) 


``````````diff

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 72feb5492c67a..c69cd03f97b78 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1504,12 +1504,26 @@ multiclass SignedSaturatingTruncate<ValueType input, ValueType output,
 defm : SignedSaturatingTruncate<v8i16, v16i8, NARROW_S_I8x16, -128, 127, 0xFF>;
 defm : SignedSaturatingTruncate<v4i32, v8i16, NARROW_S_I16x8, -32768, 32767, 0xFFFF>;
 
+// NOTE: the saturation is actually signed; the truncation is unsigned, see
+// https://www.w3.org/TR/wasm-core-2/#-hrefop-narrowmathrmnarrowmathsfu_m-n-i
 multiclass UnsignedSaturatingTruncate<ValueType input, ValueType output,
-                                    Instruction narrow, int maxval> {
+                                      Instruction narrow, int maxval> {
   def : Pat<
     (output (wasm_narrow_u
-      (umin (input V128:$a), (splat_vector (i32 maxval))),
-      (umin (input V128:$b), (splat_vector (i32 maxval)))
+      (smin (smax (input V128:$a), (splat_vector (i32 0))),
+            (splat_vector (i32 maxval))),
+      (smin (smax (input V128:$b), (splat_vector (i32 0))),
+            (splat_vector (i32 maxval)))
+    )),
+    (narrow V128:$a, V128:$b)
+  >;
+
+  def : Pat<
+    (output (wasm_narrow_u
+      (smax (smin (input V128:$a), (splat_vector (i32 maxval))),
+            (splat_vector (i32 0))),
+      (smax (smin (input V128:$b), (splat_vector (i32 maxval))),
+            (splat_vector (i32 0)))
     )),
     (narrow V128:$a, V128:$b)
   >;
diff --git a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
index 55409d5b2d8c3..f77443e2725c2 100644
--- a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
@@ -484,7 +484,7 @@ entry:
 define <8 x i16> @utest_f16i16(<8 x half> %x) {
 ; CHECK-LABEL: utest_f16i16:
 ; CHECK:         .functype utest_f16i16 (i32, i32, i32, i32, i32, i32, i32, i32) -> (v128)
-; CHECK-NEXT:    .local f32, f32, f32, f32, f32
+; CHECK-NEXT:    .local f32, f32, f32, f32, f32, v128
 ; CHECK-NEXT:  # %bb.0: # %entry
 ; CHECK-NEXT:    local.get 5
 ; CHECK-NEXT:    call __extendhfsf2
@@ -516,6 +516,9 @@ define <8 x i16> @utest_f16i16(<8 x half> %x) {
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    i32.trunc_sat_f32_u
 ; CHECK-NEXT:    i32x4.replace_lane 3
+; CHECK-NEXT:    v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT:    local.tee 13
+; CHECK-NEXT:    i32x4.min_u
 ; CHECK-NEXT:    local.get 9
 ; CHECK-NEXT:    i32.trunc_sat_f32_u
 ; CHECK-NEXT:    i32x4.splat
@@ -528,6 +531,8 @@ define <8 x i16> @utest_f16i16(<8 x half> %x) {
 ; CHECK-NEXT:    local.get 11
 ; CHECK-NEXT:    i32.trunc_sat_f32_u
 ; CHECK-NEXT:    i32x4.replace_lane 3
+; CHECK-NEXT:    local.get 13
+; CHECK-NEXT:    i32x4.min_u
 ; CHECK-NEXT:    i16x8.narrow_i32x4_u
 ; CHECK-NEXT:    # fallthrough-return
 entry:
@@ -541,7 +546,7 @@ entry:
 define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-LABEL: ustest_f16i16:
 ; CHECK:         .functype ustest_f16i16 (i32, i32, i32, i32, i32, i32, i32, i32) -> (v128)
-; CHECK-NEXT:    .local f32, f32, f32, f32, f32, v128, v128
+; CHECK-NEXT:    .local f32, f32, f32, f32, f32
 ; CHECK-NEXT:  # %bb.0: # %entry
 ; CHECK-NEXT:    local.get 5
 ; CHECK-NEXT:    call __extendhfsf2
@@ -573,12 +578,6 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    i32.trunc_sat_f32_s
 ; CHECK-NEXT:    i32x4.replace_lane 3
-; CHECK-NEXT:    v128.const 65535, 65535, 65535, 65535
-; CHECK-NEXT:    local.tee 13
-; CHECK-NEXT:    i32x4.min_s
-; CHECK-NEXT:    v128.const 0, 0, 0, 0
-; CHECK-NEXT:    local.tee 14
-; CHECK-NEXT:    i32x4.max_s
 ; CHECK-NEXT:    local.get 9
 ; CHECK-NEXT:    i32.trunc_sat_f32_s
 ; CHECK-NEXT:    i32x4.splat
@@ -591,10 +590,6 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NEXT:    local.get 11
 ; CHECK-NEXT:    i32.trunc_sat_f32_s
 ; CHECK-NEXT:    i32x4.replace_lane 3
-; CHECK-NEXT:    local.get 13
-; CHECK-NEXT:    i32x4.min_s
-; CHECK-NEXT:    local.get 14
-; CHECK-NEXT:    i32x4.max_s
 ; CHECK-NEXT:    i16x8.narrow_i32x4_u
 ; CHECK-NEXT:    # fallthrough-return
 entry:
@@ -1850,7 +1845,7 @@ entry:
 define <8 x i16> @utest_f16i16_mm(<8 x half> %x) {
 ; CHECK-LABEL: utest_f16i16_mm:
 ; CHECK:         .functype utest_f16i16_mm (i32, i32, i32, i32, i32, i32, i32, i32) -> (v128)
-; CHECK-NEXT:    .local f32, f32, f32, f32, f32
+; CHECK-NEXT:    .local f32, f32, f32, f32, f32, v128
 ; CHECK-NEXT:  # %bb.0: # %entry
 ; CHECK-NEXT:    local.get 5
 ; CHECK-NEXT:    call __extendhfsf2
@@ -1882,6 +1877,9 @@ define <8 x i16> @utest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    i32.trunc_sat_f32_u
 ; CHECK-NEXT:    i32x4.replace_lane 3
+; CHECK-NEXT:    v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT:    local.tee 13
+; CHECK-NEXT:    i32x4.min_u
 ; CHECK-NEXT:    local.get 9
 ; CHECK-NEXT:    i32.trunc_sat_f32_u
 ; CHECK-NEXT:    i32x4.splat
@@ -1894,6 +1892,8 @@ define <8 x i16> @utest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEXT:    local.get 11
 ; CHECK-NEXT:    i32.trunc_sat_f32_u
 ; CHECK-NEXT:    i32x4.replace_lane 3
+; CHECK-NEXT:    local.get 13
+; CHECK-NEXT:    i32x4.min_u
 ; CHECK-NEXT:    i16x8.narrow_i32x4_u
 ; CHECK-NEXT:    # fallthrough-return
 entry:
@@ -1906,7 +1906,7 @@ entry:
 define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-LABEL: ustest_f16i16_mm:
 ; CHECK:         .functype ustest_f16i16_mm (i32, i32, i32, i32, i32, i32, i32, i32) -> (v128)
-; CHECK-NEXT:    .local f32, f32, f32, f32, f32, v128, v128
+; CHECK-NEXT:    .local f32, f32, f32, f32, f32
 ; CHECK-NEXT:  # %bb.0: # %entry
 ; CHECK-NEXT:    local.get 5
 ; CHECK-NEXT:    call __extendhfsf2
@@ -1938,12 +1938,6 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    i32.trunc_sat_f32_s
 ; CHECK-NEXT:    i32x4.replace_lane 3
-; CHECK-NEXT:    v128.const 65535, 65535, 65535, 65535
-; CHECK-NEXT:    local.tee 13
-; CHECK-NEXT:    i32x4.min_s
-; CHECK-NEXT:    v128.const 0, 0, 0, 0
-; CHECK-NEXT:    local.tee 14
-; CHECK-NEXT:    i32x4.max_s
 ; CHECK-NEXT:    local.get 9
 ; CHECK-NEXT:    i32.trunc_sat_f32_s
 ; CHECK-NEXT:    i32x4.splat
@@ -1956,10 +1950,6 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEXT:    local.get 11
 ; CHECK-NEXT:    i32.trunc_sat_f32_s
 ; CHECK-NEXT:    i32x4.replace_lane 3
-; CHECK-NEXT:    local.get 13
-; CHECK-NEXT:    i32x4.min_s
-; CHECK-NEXT:    local.get 14
-; CHECK-NEXT:    i32x4.max_s
 ; CHECK-NEXT:    i16x8.narrow_i32x4_u
 ; CHECK-NEXT:    # fallthrough-return
 entry:
diff --git a/llvm/test/CodeGen/WebAssembly/saturating-truncation.ll b/llvm/test/CodeGen/WebAssembly/saturating-truncation.ll
index f3f3ba9b268d7..6070dfa5bed76 100644
--- a/llvm/test/CodeGen/WebAssembly/saturating-truncation.ll
+++ b/llvm/test/CodeGen/WebAssembly/saturating-truncation.ll
@@ -56,12 +56,19 @@ bb2:
   ret <8 x i16> %3
 }
 
-define <16 x i8> @i16_unsigned(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: i16_unsigned:
-; CHECK:         .functype i16_unsigned (v128, v128) -> (v128)
+; NOTE: unsigned narrow uses *signed* saturation, so the manual unsigned saturation cannot be optimized out.
+define <16 x i8> @i16_unsigned_sat_unsigned_truncate(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: i16_unsigned_sat_unsigned_truncate:
+; CHECK:         .functype i16_unsigned_sat_unsigned_truncate (v128, v128) -> (v128)
+; CHECK-NEXT:    .local v128
 ; CHECK-NEXT:  # %bb.0: # %bb2
 ; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.const 255, 255, 255, 255, 255, 255, 255, 255
+; CHECK-NEXT:    local.tee 2
+; CHECK-NEXT:    i16x8.min_u
 ; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i16x8.min_u
 ; CHECK-NEXT:    i8x16.narrow_i16x8_u
 ; CHECK-NEXT:    # fallthrough-return
 bb2:
@@ -71,12 +78,19 @@ bb2:
   ret <16 x i8> %2
 }
 
-define <8 x i16> @i32_unsigned(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: i32_unsigned:
-; CHECK:         .functype i32_unsigned (v128, v128) -> (v128)
+; NOTE: unsigned narrow uses *signed* saturation, so the manual unsigned saturation cannot be optimized out.
+define <8 x i16> @i32_unsigned_sat_unsigned_truncate(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: i32_unsigned_sat_unsigned_truncate:
+; CHECK:         .functype i32_unsigned_sat_unsigned_truncate (v128, v128) -> (v128)
+; CHECK-NEXT:    .local v128
 ; CHECK-NEXT:  # %bb.0: # %bb2
 ; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT:    local.tee 2
+; CHECK-NEXT:    i32x4.min_u
 ; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i32x4.min_u
 ; CHECK-NEXT:    i16x8.narrow_i32x4_u
 ; CHECK-NEXT:    # fallthrough-return
 bb2:
@@ -85,3 +99,53 @@ bb2:
   %2 = trunc nsw <8 x i32> %1 to <8 x i16>
   ret <8 x i16> %2
 }
+
+; NOTE: narrow_i16x8_u uses *signed* saturation, so the manual unsigned saturation cannot be optimized out.
+define <16 x i8> @narrow_with_manual_unsigned_sat(<8 x i16> %a) {
+; CHECK-LABEL: narrow_with_manual_unsigned_sat:
+; CHECK:         .functype narrow_with_manual_unsigned_sat (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0: # %start
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.const 255, 255, 255, 255, 255, 255, 255, 255
+; CHECK-NEXT:    i16x8.min_u
+; CHECK-NEXT:    local.tee 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    # fallthrough-return
+start:
+  %0 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a, <8 x i16> splat (i16 255))
+  %_21 = tail call <16 x i8> @llvm.wasm.narrow.unsigned.v16i8.v8i16(<8 x i16> %0, <8 x i16> %0)
+  ret <16 x i8> %_21
+}
+
+define <16 x i8> @i16_signed_sat_unsigned_truncate(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: i16_signed_sat_unsigned_truncate:
+; CHECK:         .functype i16_signed_sat_unsigned_truncate (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0: # %bb2
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    # fallthrough-return
+bb2:
+  %0 = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %1 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %0, <16 x i16> zeroinitializer)
+  %2 = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %1, <16 x i16> splat (i16 255))
+  %3 = trunc nuw <16 x i16> %2 to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define <8 x i16> @i32_signed_sat_unsigned_truncate(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: i32_signed_sat_unsigned_truncate:
+; CHECK:         .functype i32_signed_sat_unsigned_truncate (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0: # %bb2
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    # fallthrough-return
+bb2:
+  %0 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %1 = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %0, <8 x i32> splat (i32 65535))
+  %2 = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> %1, <8 x i32> zeroinitializer)
+  %3 = trunc nuw <8 x i32> %2 to <8 x i16>
+  ret <8 x i16> %3
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/201909