[llvm] [WebAssembly] Add support for avgr_u in loops (PR #153252)

Jasmine Tang via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 21 10:40:17 PDT 2025


https://github.com/badumbatish updated https://github.com/llvm/llvm-project/pull/153252

>From 4a82aea026ede56dfe5010fb4310fec61cf41d63 Mon Sep 17 00:00:00 2001
From: Jasmine Tang <jjasmine at igalia.com>
Date: Tue, 12 Aug 2025 11:53:15 -0700
Subject: [PATCH 1/5] Precommit test for avgr pattern

---
 llvm/test/CodeGen/WebAssembly/simd-avgr.ll | 230 +++++++++++++++++++++
 1 file changed, 230 insertions(+)
 create mode 100644 llvm/test/CodeGen/WebAssembly/simd-avgr.ll

diff --git a/llvm/test/CodeGen/WebAssembly/simd-avgr.ll b/llvm/test/CodeGen/WebAssembly/simd-avgr.ll
new file mode 100644
index 0000000000000..ac49821a57966
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-avgr.ll
@@ -0,0 +1,230 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -O2 -mtriple=wasm32 -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
+
+;void f(unsigned char *x, unsigned char *y, int n) {
+;  for (int i = 0; i < n; i++)
+;    x[i] = (x[i] + y[i] + 1) / 2;
+;}
+
+define void @f(ptr %x, ptr %y, i32 %n) {
+; CHECK-LABEL: f:
+; CHECK:         .functype f (i32, i32, i32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    i32.const $push0=, 1
+; CHECK-NEXT:    i32.lt_s $push1=, $2, $pop0
+; CHECK-NEXT:    br_if 0, $pop1 # 0: down to label0
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    i32.const $5=, 0
+; CHECK-NEXT:    block
+; CHECK-NEXT:    i32.const $push2=, 16
+; CHECK-NEXT:    i32.lt_u $push3=, $2, $pop2
+; CHECK-NEXT:    br_if 0, $pop3 # 0: down to label1
+; CHECK-NEXT:  # %bb.2: # %vector.memcheck
+; CHECK-NEXT:    block
+; CHECK-NEXT:    i32.add $push5=, $1, $2
+; CHECK-NEXT:    i32.ge_u $push6=, $0, $pop5
+; CHECK-NEXT:    br_if 0, $pop6 # 0: down to label2
+; CHECK-NEXT:  # %bb.3: # %vector.memcheck
+; CHECK-NEXT:    i32.add $push4=, $0, $2
+; CHECK-NEXT:    i32.lt_u $push7=, $1, $pop4
+; CHECK-NEXT:    br_if 1, $pop7 # 1: down to label1
+; CHECK-NEXT:  .LBB0_4: # %vector.ph
+; CHECK-NEXT:    end_block # label2:
+; CHECK-NEXT:    local.copy $6=, $0
+; CHECK-NEXT:    local.copy $7=, $1
+; CHECK-NEXT:    i32.const $push8=, 2147483632
+; CHECK-NEXT:    i32.and $push34=, $2, $pop8
+; CHECK-NEXT:    local.tee $push33=, $5=, $pop34
+; CHECK-NEXT:    local.copy $8=, $pop33
+; CHECK-NEXT:  .LBB0_5: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    loop # label3:
+; CHECK-NEXT:    v128.load $push44=, 0($6):p2align=0
+; CHECK-NEXT:    local.tee $push43=, $4=, $pop44
+; CHECK-NEXT:    v128.load $push42=, 0($7):p2align=0
+; CHECK-NEXT:    local.tee $push41=, $3=, $pop42
+; CHECK-NEXT:    v128.or $push9=, $pop43, $pop41
+; CHECK-NEXT:    v128.xor $push10=, $4, $3
+; CHECK-NEXT:    i32.const $push40=, 1
+; CHECK-NEXT:    i8x16.shr_u $push11=, $pop10, $pop40
+; CHECK-NEXT:    i8x16.sub $push12=, $pop9, $pop11
+; CHECK-NEXT:    v128.store 0($6):p2align=0, $pop12
+; CHECK-NEXT:    i32.const $push39=, 16
+; CHECK-NEXT:    i32.add $6=, $6, $pop39
+; CHECK-NEXT:    i32.const $push38=, 16
+; CHECK-NEXT:    i32.add $7=, $7, $pop38
+; CHECK-NEXT:    i32.const $push37=, -16
+; CHECK-NEXT:    i32.add $push36=, $8, $pop37
+; CHECK-NEXT:    local.tee $push35=, $8=, $pop36
+; CHECK-NEXT:    br_if 0, $pop35 # 0: up to label3
+; CHECK-NEXT:  # %bb.6: # %middle.block
+; CHECK-NEXT:    end_loop
+; CHECK-NEXT:    i32.eq $push13=, $2, $5
+; CHECK-NEXT:    br_if 1, $pop13 # 1: down to label0
+; CHECK-NEXT:  .LBB0_7: # %for.body.preheader16
+; CHECK-NEXT:    end_block # label1:
+; CHECK-NEXT:    i32.const $push46=, 1
+; CHECK-NEXT:    i32.or $6=, $5, $pop46
+; CHECK-NEXT:    block
+; CHECK-NEXT:    i32.const $push45=, 1
+; CHECK-NEXT:    i32.and $push14=, $2, $pop45
+; CHECK-NEXT:    i32.eqz $push64=, $pop14
+; CHECK-NEXT:    br_if 0, $pop64 # 0: down to label4
+; CHECK-NEXT:  # %bb.8: # %for.body.prol
+; CHECK-NEXT:    i32.add $push50=, $0, $5
+; CHECK-NEXT:    local.tee $push49=, $7=, $pop50
+; CHECK-NEXT:    i32.load8_u $push17=, 0($7)
+; CHECK-NEXT:    i32.add $push15=, $1, $5
+; CHECK-NEXT:    i32.load8_u $push16=, 0($pop15)
+; CHECK-NEXT:    i32.add $push18=, $pop17, $pop16
+; CHECK-NEXT:    i32.const $push48=, 1
+; CHECK-NEXT:    i32.add $push19=, $pop18, $pop48
+; CHECK-NEXT:    i32.const $push47=, 1
+; CHECK-NEXT:    i32.shr_u $push20=, $pop19, $pop47
+; CHECK-NEXT:    i32.store8 0($pop49), $pop20
+; CHECK-NEXT:    local.copy $5=, $6
+; CHECK-NEXT:  .LBB0_9: # %for.body.prol.loopexit
+; CHECK-NEXT:    end_block # label4:
+; CHECK-NEXT:    i32.eq $push21=, $2, $6
+; CHECK-NEXT:    br_if 0, $pop21 # 0: down to label0
+; CHECK-NEXT:  # %bb.10: # %for.body.preheader1
+; CHECK-NEXT:    i32.add $6=, $0, $5
+; CHECK-NEXT:    i32.add $7=, $1, $5
+; CHECK-NEXT:    i32.sub $8=, $2, $5
+; CHECK-NEXT:  .LBB0_11: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    loop # label5:
+; CHECK-NEXT:    i32.load8_u $push23=, 0($6)
+; CHECK-NEXT:    i32.load8_u $push22=, 0($7)
+; CHECK-NEXT:    i32.add $push24=, $pop23, $pop22
+; CHECK-NEXT:    i32.const $push63=, 1
+; CHECK-NEXT:    i32.add $push25=, $pop24, $pop63
+; CHECK-NEXT:    i32.const $push62=, 1
+; CHECK-NEXT:    i32.shr_u $push26=, $pop25, $pop62
+; CHECK-NEXT:    i32.store8 0($6), $pop26
+; CHECK-NEXT:    i32.const $push61=, 1
+; CHECK-NEXT:    i32.add $push60=, $6, $pop61
+; CHECK-NEXT:    local.tee $push59=, $2=, $pop60
+; CHECK-NEXT:    i32.load8_u $push27=, 0($2)
+; CHECK-NEXT:    i32.const $push58=, 1
+; CHECK-NEXT:    i32.add $push28=, $7, $pop58
+; CHECK-NEXT:    i32.load8_u $push29=, 0($pop28)
+; CHECK-NEXT:    i32.add $push30=, $pop27, $pop29
+; CHECK-NEXT:    i32.const $push57=, 1
+; CHECK-NEXT:    i32.add $push31=, $pop30, $pop57
+; CHECK-NEXT:    i32.const $push56=, 1
+; CHECK-NEXT:    i32.shr_u $push32=, $pop31, $pop56
+; CHECK-NEXT:    i32.store8 0($pop59), $pop32
+; CHECK-NEXT:    i32.const $push55=, 2
+; CHECK-NEXT:    i32.add $6=, $6, $pop55
+; CHECK-NEXT:    i32.const $push54=, 2
+; CHECK-NEXT:    i32.add $7=, $7, $pop54
+; CHECK-NEXT:    i32.const $push53=, -2
+; CHECK-NEXT:    i32.add $push52=, $8, $pop53
+; CHECK-NEXT:    local.tee $push51=, $8=, $pop52
+; CHECK-NEXT:    br_if 0, $pop51 # 0: up to label5
+; CHECK-NEXT:  .LBB0_12: # %for.cond.cleanup
+; CHECK-NEXT:    end_loop
+; CHECK-NEXT:    end_block # label0:
+; CHECK-NEXT:    return
+entry:
+  %cmp12 = icmp sgt i32 %n, 0
+  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %min.iters.check = icmp ult i32 %n, 16
+  br i1 %min.iters.check, label %for.body.preheader16, label %vector.memcheck
+
+vector.memcheck:
+  %scevgep = getelementptr i8, ptr %x, i32 %n
+  %scevgep14 = getelementptr i8, ptr %y, i32 %n
+  %bound0 = icmp ult ptr %x, %scevgep14
+  %bound1 = icmp ult ptr %y, %scevgep
+  %found.conflict = and i1 %bound0, %bound1
+  br i1 %found.conflict, label %for.body.preheader16, label %vector.ph
+
+vector.ph:
+  %n.vec = and i32 %n, 2147483632
+  br label %vector.body
+
+vector.body:
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds nuw i8, ptr %x, i32 %index
+  %wide.load = load <16 x i8>, ptr %0, align 1
+  %1 = zext <16 x i8> %wide.load to <16 x i16>
+  %2 = getelementptr inbounds nuw i8, ptr %y, i32 %index
+  %wide.load15 = load <16 x i8>, ptr %2, align 1
+  %3 = zext <16 x i8> %wide.load15 to <16 x i16>
+  %4 = add nuw nsw <16 x i16> %1, splat (i16 1)
+  %5 = add nuw nsw <16 x i16> %4, %3
+  %6 = lshr <16 x i16> %5, splat (i16 1)
+  %7 = trunc nuw <16 x i16> %6 to <16 x i8>
+  store <16 x i8> %7, ptr %0, align 1
+  %index.next = add nuw i32 %index, 16
+  %8 = icmp eq i32 %index.next, %n.vec
+  br i1 %8, label %middle.block, label %vector.body
+
+middle.block:
+  %cmp.n = icmp eq i32 %n, %n.vec
+  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader16
+
+for.body.preheader16:
+  %i.013.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
+  %.neg = or disjoint i32 %i.013.ph, 1
+  %xtraiter = and i32 %n, 1
+  %lcmp.mod.not = icmp eq i32 %xtraiter, 0
+  br i1 %lcmp.mod.not, label %for.body.prol.loopexit, label %for.body.prol
+
+for.body.prol:
+  %arrayidx.prol = getelementptr inbounds nuw i8, ptr %x, i32 %i.013.ph
+  %9 = load i8, ptr %arrayidx.prol, align 1
+  %conv.prol = zext i8 %9 to i16
+  %arrayidx1.prol = getelementptr inbounds nuw i8, ptr %y, i32 %i.013.ph
+  %10 = load i8, ptr %arrayidx1.prol, align 1
+  %conv2.prol = zext i8 %10 to i16
+  %add.prol = add nuw nsw i16 %conv.prol, 1
+  %add3.prol = add nuw nsw i16 %add.prol, %conv2.prol
+  %div11.prol = lshr i16 %add3.prol, 1
+  %conv4.prol = trunc nuw i16 %div11.prol to i8
+  store i8 %conv4.prol, ptr %arrayidx.prol, align 1
+  %inc.prol = or disjoint i32 %i.013.ph, 1
+  br label %for.body.prol.loopexit
+
+for.body.prol.loopexit:
+  %i.013.unr = phi i32 [ %i.013.ph, %for.body.preheader16 ], [ %inc.prol, %for.body.prol ]
+  %11 = icmp eq i32 %n, %.neg
+  br i1 %11, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.013 = phi i32 [ %inc.1, %for.body ], [ %i.013.unr, %for.body.prol.loopexit ]
+  %arrayidx = getelementptr inbounds nuw i8, ptr %x, i32 %i.013
+  %12 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %12 to i16
+  %arrayidx1 = getelementptr inbounds nuw i8, ptr %y, i32 %i.013
+  %13 = load i8, ptr %arrayidx1, align 1
+  %conv2 = zext i8 %13 to i16
+  %add = add nuw nsw i16 %conv, 1
+  %add3 = add nuw nsw i16 %add, %conv2
+  %div11 = lshr i16 %add3, 1
+  %conv4 = trunc nuw i16 %div11 to i8
+  store i8 %conv4, ptr %arrayidx, align 1
+  %inc = add nuw nsw i32 %i.013, 1
+  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %x, i32 %inc
+  %14 = load i8, ptr %arrayidx.1, align 1
+  %conv.1 = zext i8 %14 to i16
+  %arrayidx1.1 = getelementptr inbounds nuw i8, ptr %y, i32 %inc
+  %15 = load i8, ptr %arrayidx1.1, align 1
+  %conv2.1 = zext i8 %15 to i16
+  %add.1 = add nuw nsw i16 %conv.1, 1
+  %add3.1 = add nuw nsw i16 %add.1, %conv2.1
+  %div11.1 = lshr i16 %add3.1, 1
+  %conv4.1 = trunc nuw i16 %div11.1 to i8
+  store i8 %conv4.1, ptr %arrayidx.1, align 1
+  %inc.1 = add nuw nsw i32 %i.013, 2
+  %exitcond.not.1 = icmp eq i32 %inc.1, %n
+  br i1 %exitcond.not.1, label %for.cond.cleanup, label %for.body
+}

>From fe2ee69d6882aefd0e70e94c38e31991fdec57bd Mon Sep 17 00:00:00 2001
From: Jasmine Tang <jjasmine at igalia.com>
Date: Tue, 12 Aug 2025 12:00:31 -0700
Subject: [PATCH 2/5] Support avgr_u in loop construct

---
 .../WebAssembly/WebAssemblyISelLowering.cpp   |   4 +
 .../WebAssembly/WebAssemblyInstrSIMD.td       |   3 +
 llvm/test/CodeGen/WebAssembly/simd-avgr.ll    | 144 +++++++++---------
 3 files changed, 76 insertions(+), 75 deletions(-)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 3f80b2ab2bd6d..4299313c28802 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -246,6 +246,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
                    MVT::v2f64})
       setOperationAction(ISD::SPLAT_VECTOR, T, Legal);
 
+    // Set avgceilu as legal for i8x16 and i16x8
+    // and isel will convert to AVGR_U w/ tablegen
+    setOperationAction({ISD::AVGCEILU}, {MVT::v8i16, MVT::v16i8}, Legal);
+
     // Custom lowering since wasm shifts must have a scalar shift amount
     for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL})
       for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64})
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 143298b700928..fb508e3dc9a7f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1144,6 +1144,9 @@ def : Pat<(wasm_shr_u
               (vec.splat (i32 1))),
             (i32 1)),
           (inst $lhs, $rhs)>;
+
+def : Pat<(vec.vt(avgceilu(vec.vt V128:$lhs), (vec.vt V128:$rhs))), (inst $lhs,
+                                                                        $rhs)>;
 }
 
 // Widening dot product: i32x4.dot_i16x8_s
diff --git a/llvm/test/CodeGen/WebAssembly/simd-avgr.ll b/llvm/test/CodeGen/WebAssembly/simd-avgr.ll
index ac49821a57966..c4e6b12244e9b 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-avgr.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-avgr.ll
@@ -15,7 +15,7 @@ define void @f(ptr %x, ptr %y, i32 %n) {
 ; CHECK-NEXT:    i32.lt_s $push1=, $2, $pop0
 ; CHECK-NEXT:    br_if 0, $pop1 # 0: down to label0
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
-; CHECK-NEXT:    i32.const $5=, 0
+; CHECK-NEXT:    i32.const $3=, 0
 ; CHECK-NEXT:    block
 ; CHECK-NEXT:    i32.const $push2=, 16
 ; CHECK-NEXT:    i32.lt_u $push3=, $2, $pop2
@@ -31,99 +31,93 @@ define void @f(ptr %x, ptr %y, i32 %n) {
 ; CHECK-NEXT:    br_if 1, $pop7 # 1: down to label1
 ; CHECK-NEXT:  .LBB0_4: # %vector.ph
 ; CHECK-NEXT:    end_block # label2:
-; CHECK-NEXT:    local.copy $6=, $0
-; CHECK-NEXT:    local.copy $7=, $1
+; CHECK-NEXT:    local.copy $4=, $0
+; CHECK-NEXT:    local.copy $5=, $1
 ; CHECK-NEXT:    i32.const $push8=, 2147483632
-; CHECK-NEXT:    i32.and $push34=, $2, $pop8
-; CHECK-NEXT:    local.tee $push33=, $5=, $pop34
-; CHECK-NEXT:    local.copy $8=, $pop33
+; CHECK-NEXT:    i32.and $push33=, $2, $pop8
+; CHECK-NEXT:    local.tee $push32=, $3=, $pop33
+; CHECK-NEXT:    local.copy $6=, $pop32
 ; CHECK-NEXT:  .LBB0_5: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    loop # label3:
-; CHECK-NEXT:    v128.load $push44=, 0($6):p2align=0
-; CHECK-NEXT:    local.tee $push43=, $4=, $pop44
-; CHECK-NEXT:    v128.load $push42=, 0($7):p2align=0
-; CHECK-NEXT:    local.tee $push41=, $3=, $pop42
-; CHECK-NEXT:    v128.or $push9=, $pop43, $pop41
-; CHECK-NEXT:    v128.xor $push10=, $4, $3
-; CHECK-NEXT:    i32.const $push40=, 1
-; CHECK-NEXT:    i8x16.shr_u $push11=, $pop10, $pop40
-; CHECK-NEXT:    i8x16.sub $push12=, $pop9, $pop11
-; CHECK-NEXT:    v128.store 0($6):p2align=0, $pop12
-; CHECK-NEXT:    i32.const $push39=, 16
-; CHECK-NEXT:    i32.add $6=, $6, $pop39
+; CHECK-NEXT:    v128.load $push10=, 0($4):p2align=0
+; CHECK-NEXT:    v128.load $push9=, 0($5):p2align=0
+; CHECK-NEXT:    i8x16.avgr_u $push11=, $pop10, $pop9
+; CHECK-NEXT:    v128.store 0($4):p2align=0, $pop11
 ; CHECK-NEXT:    i32.const $push38=, 16
-; CHECK-NEXT:    i32.add $7=, $7, $pop38
-; CHECK-NEXT:    i32.const $push37=, -16
-; CHECK-NEXT:    i32.add $push36=, $8, $pop37
-; CHECK-NEXT:    local.tee $push35=, $8=, $pop36
-; CHECK-NEXT:    br_if 0, $pop35 # 0: up to label3
+; CHECK-NEXT:    i32.add $4=, $4, $pop38
+; CHECK-NEXT:    i32.const $push37=, 16
+; CHECK-NEXT:    i32.add $5=, $5, $pop37
+; CHECK-NEXT:    i32.const $push36=, -16
+; CHECK-NEXT:    i32.add $push35=, $6, $pop36
+; CHECK-NEXT:    local.tee $push34=, $6=, $pop35
+; CHECK-NEXT:    br_if 0, $pop34 # 0: up to label3
 ; CHECK-NEXT:  # %bb.6: # %middle.block
 ; CHECK-NEXT:    end_loop
-; CHECK-NEXT:    i32.eq $push13=, $2, $5
-; CHECK-NEXT:    br_if 1, $pop13 # 1: down to label0
+; CHECK-NEXT:    i32.eq $push12=, $2, $3
+; CHECK-NEXT:    br_if 1, $pop12 # 1: down to label0
 ; CHECK-NEXT:  .LBB0_7: # %for.body.preheader16
 ; CHECK-NEXT:    end_block # label1:
-; CHECK-NEXT:    i32.const $push46=, 1
-; CHECK-NEXT:    i32.or $6=, $5, $pop46
+; CHECK-NEXT:    i32.const $push40=, 1
+; CHECK-NEXT:    i32.or $4=, $3, $pop40
 ; CHECK-NEXT:    block
-; CHECK-NEXT:    i32.const $push45=, 1
-; CHECK-NEXT:    i32.and $push14=, $2, $pop45
-; CHECK-NEXT:    i32.eqz $push64=, $pop14
-; CHECK-NEXT:    br_if 0, $pop64 # 0: down to label4
+; CHECK-NEXT:    i32.const $push39=, 1
+; CHECK-NEXT:    i32.and $push13=, $2, $pop39
+; CHECK-NEXT:    i32.eqz $push58=, $pop13
+; CHECK-NEXT:    br_if 0, $pop58 # 0: down to label4
 ; CHECK-NEXT:  # %bb.8: # %for.body.prol
-; CHECK-NEXT:    i32.add $push50=, $0, $5
-; CHECK-NEXT:    local.tee $push49=, $7=, $pop50
-; CHECK-NEXT:    i32.load8_u $push17=, 0($7)
-; CHECK-NEXT:    i32.add $push15=, $1, $5
-; CHECK-NEXT:    i32.load8_u $push16=, 0($pop15)
-; CHECK-NEXT:    i32.add $push18=, $pop17, $pop16
-; CHECK-NEXT:    i32.const $push48=, 1
-; CHECK-NEXT:    i32.add $push19=, $pop18, $pop48
-; CHECK-NEXT:    i32.const $push47=, 1
-; CHECK-NEXT:    i32.shr_u $push20=, $pop19, $pop47
-; CHECK-NEXT:    i32.store8 0($pop49), $pop20
-; CHECK-NEXT:    local.copy $5=, $6
+; CHECK-NEXT:    i32.add $push44=, $0, $3
+; CHECK-NEXT:    local.tee $push43=, $5=, $pop44
+; CHECK-NEXT:    i32.load8_u $push16=, 0($5)
+; CHECK-NEXT:    i32.add $push14=, $1, $3
+; CHECK-NEXT:    i32.load8_u $push15=, 0($pop14)
+; CHECK-NEXT:    i32.add $push17=, $pop16, $pop15
+; CHECK-NEXT:    i32.const $push42=, 1
+; CHECK-NEXT:    i32.add $push18=, $pop17, $pop42
+; CHECK-NEXT:    i32.const $push41=, 1
+; CHECK-NEXT:    i32.shr_u $push19=, $pop18, $pop41
+; CHECK-NEXT:    i32.store8 0($pop43), $pop19
+; CHECK-NEXT:    local.copy $3=, $4
 ; CHECK-NEXT:  .LBB0_9: # %for.body.prol.loopexit
 ; CHECK-NEXT:    end_block # label4:
-; CHECK-NEXT:    i32.eq $push21=, $2, $6
-; CHECK-NEXT:    br_if 0, $pop21 # 0: down to label0
+; CHECK-NEXT:    i32.eq $push20=, $2, $4
+; CHECK-NEXT:    br_if 0, $pop20 # 0: down to label0
 ; CHECK-NEXT:  # %bb.10: # %for.body.preheader1
-; CHECK-NEXT:    i32.add $6=, $0, $5
-; CHECK-NEXT:    i32.add $7=, $1, $5
-; CHECK-NEXT:    i32.sub $8=, $2, $5
+; CHECK-NEXT:    i32.add $4=, $0, $3
+; CHECK-NEXT:    i32.add $5=, $1, $3
+; CHECK-NEXT:    i32.sub $6=, $2, $3
 ; CHECK-NEXT:  .LBB0_11: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    loop # label5:
-; CHECK-NEXT:    i32.load8_u $push23=, 0($6)
-; CHECK-NEXT:    i32.load8_u $push22=, 0($7)
-; CHECK-NEXT:    i32.add $push24=, $pop23, $pop22
-; CHECK-NEXT:    i32.const $push63=, 1
-; CHECK-NEXT:    i32.add $push25=, $pop24, $pop63
-; CHECK-NEXT:    i32.const $push62=, 1
-; CHECK-NEXT:    i32.shr_u $push26=, $pop25, $pop62
-; CHECK-NEXT:    i32.store8 0($6), $pop26
-; CHECK-NEXT:    i32.const $push61=, 1
-; CHECK-NEXT:    i32.add $push60=, $6, $pop61
-; CHECK-NEXT:    local.tee $push59=, $2=, $pop60
-; CHECK-NEXT:    i32.load8_u $push27=, 0($2)
-; CHECK-NEXT:    i32.const $push58=, 1
-; CHECK-NEXT:    i32.add $push28=, $7, $pop58
-; CHECK-NEXT:    i32.load8_u $push29=, 0($pop28)
-; CHECK-NEXT:    i32.add $push30=, $pop27, $pop29
+; CHECK-NEXT:    i32.load8_u $push22=, 0($4)
+; CHECK-NEXT:    i32.load8_u $push21=, 0($5)
+; CHECK-NEXT:    i32.add $push23=, $pop22, $pop21
 ; CHECK-NEXT:    i32.const $push57=, 1
-; CHECK-NEXT:    i32.add $push31=, $pop30, $pop57
+; CHECK-NEXT:    i32.add $push24=, $pop23, $pop57
 ; CHECK-NEXT:    i32.const $push56=, 1
-; CHECK-NEXT:    i32.shr_u $push32=, $pop31, $pop56
-; CHECK-NEXT:    i32.store8 0($pop59), $pop32
-; CHECK-NEXT:    i32.const $push55=, 2
-; CHECK-NEXT:    i32.add $6=, $6, $pop55
-; CHECK-NEXT:    i32.const $push54=, 2
-; CHECK-NEXT:    i32.add $7=, $7, $pop54
-; CHECK-NEXT:    i32.const $push53=, -2
-; CHECK-NEXT:    i32.add $push52=, $8, $pop53
-; CHECK-NEXT:    local.tee $push51=, $8=, $pop52
-; CHECK-NEXT:    br_if 0, $pop51 # 0: up to label5
+; CHECK-NEXT:    i32.shr_u $push25=, $pop24, $pop56
+; CHECK-NEXT:    i32.store8 0($4), $pop25
+; CHECK-NEXT:    i32.const $push55=, 1
+; CHECK-NEXT:    i32.add $push54=, $4, $pop55
+; CHECK-NEXT:    local.tee $push53=, $2=, $pop54
+; CHECK-NEXT:    i32.load8_u $push26=, 0($2)
+; CHECK-NEXT:    i32.const $push52=, 1
+; CHECK-NEXT:    i32.add $push27=, $5, $pop52
+; CHECK-NEXT:    i32.load8_u $push28=, 0($pop27)
+; CHECK-NEXT:    i32.add $push29=, $pop26, $pop28
+; CHECK-NEXT:    i32.const $push51=, 1
+; CHECK-NEXT:    i32.add $push30=, $pop29, $pop51
+; CHECK-NEXT:    i32.const $push50=, 1
+; CHECK-NEXT:    i32.shr_u $push31=, $pop30, $pop50
+; CHECK-NEXT:    i32.store8 0($pop53), $pop31
+; CHECK-NEXT:    i32.const $push49=, 2
+; CHECK-NEXT:    i32.add $4=, $4, $pop49
+; CHECK-NEXT:    i32.const $push48=, 2
+; CHECK-NEXT:    i32.add $5=, $5, $pop48
+; CHECK-NEXT:    i32.const $push47=, -2
+; CHECK-NEXT:    i32.add $push46=, $6, $pop47
+; CHECK-NEXT:    local.tee $push45=, $6=, $pop46
+; CHECK-NEXT:    br_if 0, $pop45 # 0: up to label5
 ; CHECK-NEXT:  .LBB0_12: # %for.cond.cleanup
 ; CHECK-NEXT:    end_loop
 ; CHECK-NEXT:    end_block # label0:

>From b2345ea959e4a37a0fecf505a80a26a337361c8b Mon Sep 17 00:00:00 2001
From: Jasmine Tang <jjasmine at igalia.com>
Date: Thu, 14 Aug 2025 14:15:51 -0700
Subject: [PATCH 3/5] Address PR reviews

---
 llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 4299313c28802..dfe02cfed346b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -246,9 +246,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
                    MVT::v2f64})
       setOperationAction(ISD::SPLAT_VECTOR, T, Legal);
 
-    // Set avgceilu as legal for i8x16 and i16x8
-    // and isel will convert to AVGR_U w/ tablegen
-    setOperationAction({ISD::AVGCEILU}, {MVT::v8i16, MVT::v16i8}, Legal);
+    setOperationAction(ISD::AVGCEILU, {MVT::v8i16, MVT::v16i8}, Legal);
 
     // Custom lowering since wasm shifts must have a scalar shift amount
     for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL})

>From 1fc6cea35cd8d331ee5fd718601b833f9a58e6b6 Mon Sep 17 00:00:00 2001
From: Jasmine Tang <jjasmine at igalia.com>
Date: Wed, 20 Aug 2025 14:55:22 -0700
Subject: [PATCH 4/5] Manually reformat tablegen

---
 llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index fb508e3dc9a7f..f06f8d5174e3e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1145,8 +1145,8 @@ def : Pat<(wasm_shr_u
             (i32 1)),
           (inst $lhs, $rhs)>;
 
-def : Pat<(vec.vt(avgceilu(vec.vt V128:$lhs), (vec.vt V128:$rhs))), (inst $lhs,
-                                                                        $rhs)>;
+def : Pat<(vec.vt (avgceilu (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+          (inst $lhs, $rhs)>;
 }
 
 // Widening dot product: i32x4.dot_i16x8_s

>From 050c3464098a98ea0d1fd3a76f3da339e126b179 Mon Sep 17 00:00:00 2001
From: Jasmine Tang <jjasmine at igalia.com>
Date: Thu, 21 Aug 2025 10:27:22 -0700
Subject: [PATCH 5/5] Condensed test cases thanks to Luke

---
 llvm/test/CodeGen/WebAssembly/simd-arith.ll | 541 ++++++++++++++++++++
 llvm/test/CodeGen/WebAssembly/simd-avgr.ll  | 224 --------
 2 files changed, 541 insertions(+), 224 deletions(-)
 delete mode 100644 llvm/test/CodeGen/WebAssembly/simd-avgr.ll

diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
index 36637e1d555bd..324a0c49fb413 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
@@ -1451,6 +1451,547 @@ define <16 x i8> @avgr_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i8> %c
 }
 
+define <16 x i8> @avgr_u_v16i8_zext(<16 x i8> %x, <16 x i8> %y) {
+; SIMD128-LABEL: avgr_u_v16i8_zext:
+; SIMD128:         .functype avgr_u_v16i8_zext (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.avgr_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: avgr_u_v16i8_zext:
+; SIMD128-FAST:         .functype avgr_u_v16i8_zext (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.avgr_u $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: avgr_u_v16i8_zext:
+; NO-SIMD128:         .functype avgr_u_v16i8_zext (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-NEXT:    i32.and $push2=, $16, $pop0
+; NO-SIMD128-NEXT:    i32.const $push143=, 255
+; NO-SIMD128-NEXT:    i32.and $push1=, $32, $pop143
+; NO-SIMD128-NEXT:    i32.add $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.const $push4=, 1
+; NO-SIMD128-NEXT:    i32.add $push5=, $pop3, $pop4
+; NO-SIMD128-NEXT:    i32.const $push142=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push6=, $pop5, $pop142
+; NO-SIMD128-NEXT:    i32.store8 15($0), $pop6
+; NO-SIMD128-NEXT:    i32.const $push141=, 255
+; NO-SIMD128-NEXT:    i32.and $push8=, $15, $pop141
+; NO-SIMD128-NEXT:    i32.const $push140=, 255
+; NO-SIMD128-NEXT:    i32.and $push7=, $31, $pop140
+; NO-SIMD128-NEXT:    i32.add $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.const $push139=, 1
+; NO-SIMD128-NEXT:    i32.add $push10=, $pop9, $pop139
+; NO-SIMD128-NEXT:    i32.const $push138=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push11=, $pop10, $pop138
+; NO-SIMD128-NEXT:    i32.store8 14($0), $pop11
+; NO-SIMD128-NEXT:    i32.const $push137=, 255
+; NO-SIMD128-NEXT:    i32.and $push13=, $14, $pop137
+; NO-SIMD128-NEXT:    i32.const $push136=, 255
+; NO-SIMD128-NEXT:    i32.and $push12=, $30, $pop136
+; NO-SIMD128-NEXT:    i32.add $push14=, $pop13, $pop12
+; NO-SIMD128-NEXT:    i32.const $push135=, 1
+; NO-SIMD128-NEXT:    i32.add $push15=, $pop14, $pop135
+; NO-SIMD128-NEXT:    i32.const $push134=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push16=, $pop15, $pop134
+; NO-SIMD128-NEXT:    i32.store8 13($0), $pop16
+; NO-SIMD128-NEXT:    i32.const $push133=, 255
+; NO-SIMD128-NEXT:    i32.and $push18=, $13, $pop133
+; NO-SIMD128-NEXT:    i32.const $push132=, 255
+; NO-SIMD128-NEXT:    i32.and $push17=, $29, $pop132
+; NO-SIMD128-NEXT:    i32.add $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT:    i32.const $push131=, 1
+; NO-SIMD128-NEXT:    i32.add $push20=, $pop19, $pop131
+; NO-SIMD128-NEXT:    i32.const $push130=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push21=, $pop20, $pop130
+; NO-SIMD128-NEXT:    i32.store8 12($0), $pop21
+; NO-SIMD128-NEXT:    i32.const $push129=, 255
+; NO-SIMD128-NEXT:    i32.and $push23=, $12, $pop129
+; NO-SIMD128-NEXT:    i32.const $push128=, 255
+; NO-SIMD128-NEXT:    i32.and $push22=, $28, $pop128
+; NO-SIMD128-NEXT:    i32.add $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT:    i32.const $push127=, 1
+; NO-SIMD128-NEXT:    i32.add $push25=, $pop24, $pop127
+; NO-SIMD128-NEXT:    i32.const $push126=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push26=, $pop25, $pop126
+; NO-SIMD128-NEXT:    i32.store8 11($0), $pop26
+; NO-SIMD128-NEXT:    i32.const $push125=, 255
+; NO-SIMD128-NEXT:    i32.and $push28=, $11, $pop125
+; NO-SIMD128-NEXT:    i32.const $push124=, 255
+; NO-SIMD128-NEXT:    i32.and $push27=, $27, $pop124
+; NO-SIMD128-NEXT:    i32.add $push29=, $pop28, $pop27
+; NO-SIMD128-NEXT:    i32.const $push123=, 1
+; NO-SIMD128-NEXT:    i32.add $push30=, $pop29, $pop123
+; NO-SIMD128-NEXT:    i32.const $push122=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push31=, $pop30, $pop122
+; NO-SIMD128-NEXT:    i32.store8 10($0), $pop31
+; NO-SIMD128-NEXT:    i32.const $push121=, 255
+; NO-SIMD128-NEXT:    i32.and $push33=, $10, $pop121
+; NO-SIMD128-NEXT:    i32.const $push120=, 255
+; NO-SIMD128-NEXT:    i32.and $push32=, $26, $pop120
+; NO-SIMD128-NEXT:    i32.add $push34=, $pop33, $pop32
+; NO-SIMD128-NEXT:    i32.const $push119=, 1
+; NO-SIMD128-NEXT:    i32.add $push35=, $pop34, $pop119
+; NO-SIMD128-NEXT:    i32.const $push118=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push36=, $pop35, $pop118
+; NO-SIMD128-NEXT:    i32.store8 9($0), $pop36
+; NO-SIMD128-NEXT:    i32.const $push117=, 255
+; NO-SIMD128-NEXT:    i32.and $push38=, $9, $pop117
+; NO-SIMD128-NEXT:    i32.const $push116=, 255
+; NO-SIMD128-NEXT:    i32.and $push37=, $25, $pop116
+; NO-SIMD128-NEXT:    i32.add $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT:    i32.const $push115=, 1
+; NO-SIMD128-NEXT:    i32.add $push40=, $pop39, $pop115
+; NO-SIMD128-NEXT:    i32.const $push114=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push41=, $pop40, $pop114
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop41
+; NO-SIMD128-NEXT:    i32.const $push113=, 255
+; NO-SIMD128-NEXT:    i32.and $push43=, $8, $pop113
+; NO-SIMD128-NEXT:    i32.const $push112=, 255
+; NO-SIMD128-NEXT:    i32.and $push42=, $24, $pop112
+; NO-SIMD128-NEXT:    i32.add $push44=, $pop43, $pop42
+; NO-SIMD128-NEXT:    i32.const $push111=, 1
+; NO-SIMD128-NEXT:    i32.add $push45=, $pop44, $pop111
+; NO-SIMD128-NEXT:    i32.const $push110=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push46=, $pop45, $pop110
+; NO-SIMD128-NEXT:    i32.store8 7($0), $pop46
+; NO-SIMD128-NEXT:    i32.const $push109=, 255
+; NO-SIMD128-NEXT:    i32.and $push48=, $7, $pop109
+; NO-SIMD128-NEXT:    i32.const $push108=, 255
+; NO-SIMD128-NEXT:    i32.and $push47=, $23, $pop108
+; NO-SIMD128-NEXT:    i32.add $push49=, $pop48, $pop47
+; NO-SIMD128-NEXT:    i32.const $push107=, 1
+; NO-SIMD128-NEXT:    i32.add $push50=, $pop49, $pop107
+; NO-SIMD128-NEXT:    i32.const $push106=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push51=, $pop50, $pop106
+; NO-SIMD128-NEXT:    i32.store8 6($0), $pop51
+; NO-SIMD128-NEXT:    i32.const $push105=, 255
+; NO-SIMD128-NEXT:    i32.and $push53=, $6, $pop105
+; NO-SIMD128-NEXT:    i32.const $push104=, 255
+; NO-SIMD128-NEXT:    i32.and $push52=, $22, $pop104
+; NO-SIMD128-NEXT:    i32.add $push54=, $pop53, $pop52
+; NO-SIMD128-NEXT:    i32.const $push103=, 1
+; NO-SIMD128-NEXT:    i32.add $push55=, $pop54, $pop103
+; NO-SIMD128-NEXT:    i32.const $push102=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push56=, $pop55, $pop102
+; NO-SIMD128-NEXT:    i32.store8 5($0), $pop56
+; NO-SIMD128-NEXT:    i32.const $push101=, 255
+; NO-SIMD128-NEXT:    i32.and $push58=, $5, $pop101
+; NO-SIMD128-NEXT:    i32.const $push100=, 255
+; NO-SIMD128-NEXT:    i32.and $push57=, $21, $pop100
+; NO-SIMD128-NEXT:    i32.add $push59=, $pop58, $pop57
+; NO-SIMD128-NEXT:    i32.const $push99=, 1
+; NO-SIMD128-NEXT:    i32.add $push60=, $pop59, $pop99
+; NO-SIMD128-NEXT:    i32.const $push98=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push61=, $pop60, $pop98
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop61
+; NO-SIMD128-NEXT:    i32.const $push97=, 255
+; NO-SIMD128-NEXT:    i32.and $push63=, $4, $pop97
+; NO-SIMD128-NEXT:    i32.const $push96=, 255
+; NO-SIMD128-NEXT:    i32.and $push62=, $20, $pop96
+; NO-SIMD128-NEXT:    i32.add $push64=, $pop63, $pop62
+; NO-SIMD128-NEXT:    i32.const $push95=, 1
+; NO-SIMD128-NEXT:    i32.add $push65=, $pop64, $pop95
+; NO-SIMD128-NEXT:    i32.const $push94=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push66=, $pop65, $pop94
+; NO-SIMD128-NEXT:    i32.store8 3($0), $pop66
+; NO-SIMD128-NEXT:    i32.const $push93=, 255
+; NO-SIMD128-NEXT:    i32.and $push68=, $3, $pop93
+; NO-SIMD128-NEXT:    i32.const $push92=, 255
+; NO-SIMD128-NEXT:    i32.and $push67=, $19, $pop92
+; NO-SIMD128-NEXT:    i32.add $push69=, $pop68, $pop67
+; NO-SIMD128-NEXT:    i32.const $push91=, 1
+; NO-SIMD128-NEXT:    i32.add $push70=, $pop69, $pop91
+; NO-SIMD128-NEXT:    i32.const $push90=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push71=, $pop70, $pop90
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop71
+; NO-SIMD128-NEXT:    i32.const $push89=, 255
+; NO-SIMD128-NEXT:    i32.and $push73=, $2, $pop89
+; NO-SIMD128-NEXT:    i32.const $push88=, 255
+; NO-SIMD128-NEXT:    i32.and $push72=, $18, $pop88
+; NO-SIMD128-NEXT:    i32.add $push74=, $pop73, $pop72
+; NO-SIMD128-NEXT:    i32.const $push87=, 1
+; NO-SIMD128-NEXT:    i32.add $push75=, $pop74, $pop87
+; NO-SIMD128-NEXT:    i32.const $push86=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push76=, $pop75, $pop86
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop76
+; NO-SIMD128-NEXT:    i32.const $push85=, 255
+; NO-SIMD128-NEXT:    i32.and $push78=, $1, $pop85
+; NO-SIMD128-NEXT:    i32.const $push84=, 255
+; NO-SIMD128-NEXT:    i32.and $push77=, $17, $pop84
+; NO-SIMD128-NEXT:    i32.add $push79=, $pop78, $pop77
+; NO-SIMD128-NEXT:    i32.const $push83=, 1
+; NO-SIMD128-NEXT:    i32.add $push80=, $pop79, $pop83
+; NO-SIMD128-NEXT:    i32.const $push82=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push81=, $pop80, $pop82
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop81
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: avgr_u_v16i8_zext:
+; NO-SIMD128-FAST:         .functype avgr_u_v16i8_zext (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.const $push143=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $17, $pop143
+; NO-SIMD128-FAST-NEXT:    i32.add $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $pop3, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push142=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push6=, $pop5, $pop142
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push141=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push8=, $2, $pop141
+; NO-SIMD128-FAST-NEXT:    i32.const $push140=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $18, $pop140
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.const $push139=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push10=, $pop9, $pop139
+; NO-SIMD128-FAST-NEXT:    i32.const $push138=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push11=, $pop10, $pop138
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop11
+; NO-SIMD128-FAST-NEXT:    i32.const $push137=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $3, $pop137
+; NO-SIMD128-FAST-NEXT:    i32.const $push136=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push12=, $19, $pop136
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $pop13, $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push135=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push15=, $pop14, $pop135
+; NO-SIMD128-FAST-NEXT:    i32.const $push134=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push16=, $pop15, $pop134
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push133=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push18=, $4, $pop133
+; NO-SIMD128-FAST-NEXT:    i32.const $push132=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push17=, $20, $pop132
+; NO-SIMD128-FAST-NEXT:    i32.add $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push131=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push20=, $pop19, $pop131
+; NO-SIMD128-FAST-NEXT:    i32.const $push130=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push21=, $pop20, $pop130
+; NO-SIMD128-FAST-NEXT:    i32.store8 3($0), $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push129=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $5, $pop129
+; NO-SIMD128-FAST-NEXT:    i32.const $push128=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push22=, $21, $pop128
+; NO-SIMD128-FAST-NEXT:    i32.add $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push127=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push25=, $pop24, $pop127
+; NO-SIMD128-FAST-NEXT:    i32.const $push126=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push26=, $pop25, $pop126
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push125=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push28=, $6, $pop125
+; NO-SIMD128-FAST-NEXT:    i32.const $push124=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push27=, $22, $pop124
+; NO-SIMD128-FAST-NEXT:    i32.add $push29=, $pop28, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.const $push123=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push30=, $pop29, $pop123
+; NO-SIMD128-FAST-NEXT:    i32.const $push122=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push31=, $pop30, $pop122
+; NO-SIMD128-FAST-NEXT:    i32.store8 5($0), $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push121=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push33=, $7, $pop121
+; NO-SIMD128-FAST-NEXT:    i32.const $push120=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push32=, $23, $pop120
+; NO-SIMD128-FAST-NEXT:    i32.add $push34=, $pop33, $pop32
+; NO-SIMD128-FAST-NEXT:    i32.const $push119=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push35=, $pop34, $pop119
+; NO-SIMD128-FAST-NEXT:    i32.const $push118=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push36=, $pop35, $pop118
+; NO-SIMD128-FAST-NEXT:    i32.store8 6($0), $pop36
+; NO-SIMD128-FAST-NEXT:    i32.const $push117=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push38=, $8, $pop117
+; NO-SIMD128-FAST-NEXT:    i32.const $push116=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $24, $pop116
+; NO-SIMD128-FAST-NEXT:    i32.add $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.const $push115=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push40=, $pop39, $pop115
+; NO-SIMD128-FAST-NEXT:    i32.const $push114=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push41=, $pop40, $pop114
+; NO-SIMD128-FAST-NEXT:    i32.store8 7($0), $pop41
+; NO-SIMD128-FAST-NEXT:    i32.const $push113=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push43=, $9, $pop113
+; NO-SIMD128-FAST-NEXT:    i32.const $push112=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push42=, $25, $pop112
+; NO-SIMD128-FAST-NEXT:    i32.add $push44=, $pop43, $pop42
+; NO-SIMD128-FAST-NEXT:    i32.const $push111=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push45=, $pop44, $pop111
+; NO-SIMD128-FAST-NEXT:    i32.const $push110=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push46=, $pop45, $pop110
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop46
+; NO-SIMD128-FAST-NEXT:    i32.const $push109=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push48=, $10, $pop109
+; NO-SIMD128-FAST-NEXT:    i32.const $push108=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push47=, $26, $pop108
+; NO-SIMD128-FAST-NEXT:    i32.add $push49=, $pop48, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.const $push107=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push50=, $pop49, $pop107
+; NO-SIMD128-FAST-NEXT:    i32.const $push106=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push51=, $pop50, $pop106
+; NO-SIMD128-FAST-NEXT:    i32.store8 9($0), $pop51
+; NO-SIMD128-FAST-NEXT:    i32.const $push105=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push53=, $11, $pop105
+; NO-SIMD128-FAST-NEXT:    i32.const $push104=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push52=, $27, $pop104
+; NO-SIMD128-FAST-NEXT:    i32.add $push54=, $pop53, $pop52
+; NO-SIMD128-FAST-NEXT:    i32.const $push103=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push55=, $pop54, $pop103
+; NO-SIMD128-FAST-NEXT:    i32.const $push102=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push56=, $pop55, $pop102
+; NO-SIMD128-FAST-NEXT:    i32.store8 10($0), $pop56
+; NO-SIMD128-FAST-NEXT:    i32.const $push101=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push58=, $12, $pop101
+; NO-SIMD128-FAST-NEXT:    i32.const $push100=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push57=, $28, $pop100
+; NO-SIMD128-FAST-NEXT:    i32.add $push59=, $pop58, $pop57
+; NO-SIMD128-FAST-NEXT:    i32.const $push99=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push60=, $pop59, $pop99
+; NO-SIMD128-FAST-NEXT:    i32.const $push98=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push61=, $pop60, $pop98
+; NO-SIMD128-FAST-NEXT:    i32.store8 11($0), $pop61
+; NO-SIMD128-FAST-NEXT:    i32.const $push97=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push63=, $13, $pop97
+; NO-SIMD128-FAST-NEXT:    i32.const $push96=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push62=, $29, $pop96
+; NO-SIMD128-FAST-NEXT:    i32.add $push64=, $pop63, $pop62
+; NO-SIMD128-FAST-NEXT:    i32.const $push95=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push65=, $pop64, $pop95
+; NO-SIMD128-FAST-NEXT:    i32.const $push94=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push66=, $pop65, $pop94
+; NO-SIMD128-FAST-NEXT:    i32.store8 12($0), $pop66
+; NO-SIMD128-FAST-NEXT:    i32.const $push93=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push68=, $14, $pop93
+; NO-SIMD128-FAST-NEXT:    i32.const $push92=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push67=, $30, $pop92
+; NO-SIMD128-FAST-NEXT:    i32.add $push69=, $pop68, $pop67
+; NO-SIMD128-FAST-NEXT:    i32.const $push91=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push70=, $pop69, $pop91
+; NO-SIMD128-FAST-NEXT:    i32.const $push90=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push71=, $pop70, $pop90
+; NO-SIMD128-FAST-NEXT:    i32.store8 13($0), $pop71
+; NO-SIMD128-FAST-NEXT:    i32.const $push89=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push73=, $15, $pop89
+; NO-SIMD128-FAST-NEXT:    i32.const $push88=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push72=, $31, $pop88
+; NO-SIMD128-FAST-NEXT:    i32.add $push74=, $pop73, $pop72
+; NO-SIMD128-FAST-NEXT:    i32.const $push87=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push75=, $pop74, $pop87
+; NO-SIMD128-FAST-NEXT:    i32.const $push86=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push76=, $pop75, $pop86
+; NO-SIMD128-FAST-NEXT:    i32.store8 14($0), $pop76
+; NO-SIMD128-FAST-NEXT:    i32.const $push85=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push78=, $16, $pop85
+; NO-SIMD128-FAST-NEXT:    i32.const $push84=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push77=, $32, $pop84
+; NO-SIMD128-FAST-NEXT:    i32.add $push79=, $pop78, $pop77
+; NO-SIMD128-FAST-NEXT:    i32.const $push83=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push80=, $pop79, $pop83
+; NO-SIMD128-FAST-NEXT:    i32.const $push82=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push81=, $pop80, $pop82
+; NO-SIMD128-FAST-NEXT:    i32.store8 15($0), $pop81
+; NO-SIMD128-FAST-NEXT:    return
+  %x.zext = zext <16 x i8> %x to <16 x i16>
+  %y.zext = zext <16 x i8> %y to <16 x i16>
+  %a = add <16 x i16> %x.zext, %y.zext
+  %b = add <16 x i16> %a, splat (i16 1)
+  %c = lshr <16 x i16> %b, splat (i16 1)
+  %c.trunc = trunc <16 x i16> %c to <16 x i8>
+  ret <16 x i8> %c.trunc
+}
+
+define <8 x i16> @avgr_u_v8i16_zext(<8 x i16> %x, <8 x i16> %y) {
+; SIMD128-LABEL: avgr_u_v8i16_zext:
+; SIMD128:         .functype avgr_u_v8i16_zext (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.avgr_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: avgr_u_v8i16_zext:
+; SIMD128-FAST:         .functype avgr_u_v8i16_zext (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.avgr_u $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: avgr_u_v8i16_zext:
+; NO-SIMD128:         .functype avgr_u_v8i16_zext (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-NEXT:    i32.and $push2=, $8, $pop0
+; NO-SIMD128-NEXT:    i32.const $push71=, 65535
+; NO-SIMD128-NEXT:    i32.and $push1=, $16, $pop71
+; NO-SIMD128-NEXT:    i32.add $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.const $push4=, 1
+; NO-SIMD128-NEXT:    i32.add $push5=, $pop3, $pop4
+; NO-SIMD128-NEXT:    i32.const $push70=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push6=, $pop5, $pop70
+; NO-SIMD128-NEXT:    i32.store16 14($0), $pop6
+; NO-SIMD128-NEXT:    i32.const $push69=, 65535
+; NO-SIMD128-NEXT:    i32.and $push8=, $7, $pop69
+; NO-SIMD128-NEXT:    i32.const $push68=, 65535
+; NO-SIMD128-NEXT:    i32.and $push7=, $15, $pop68
+; NO-SIMD128-NEXT:    i32.add $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.const $push67=, 1
+; NO-SIMD128-NEXT:    i32.add $push10=, $pop9, $pop67
+; NO-SIMD128-NEXT:    i32.const $push66=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push11=, $pop10, $pop66
+; NO-SIMD128-NEXT:    i32.store16 12($0), $pop11
+; NO-SIMD128-NEXT:    i32.const $push65=, 65535
+; NO-SIMD128-NEXT:    i32.and $push13=, $6, $pop65
+; NO-SIMD128-NEXT:    i32.const $push64=, 65535
+; NO-SIMD128-NEXT:    i32.and $push12=, $14, $pop64
+; NO-SIMD128-NEXT:    i32.add $push14=, $pop13, $pop12
+; NO-SIMD128-NEXT:    i32.const $push63=, 1
+; NO-SIMD128-NEXT:    i32.add $push15=, $pop14, $pop63
+; NO-SIMD128-NEXT:    i32.const $push62=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push16=, $pop15, $pop62
+; NO-SIMD128-NEXT:    i32.store16 10($0), $pop16
+; NO-SIMD128-NEXT:    i32.const $push61=, 65535
+; NO-SIMD128-NEXT:    i32.and $push18=, $5, $pop61
+; NO-SIMD128-NEXT:    i32.const $push60=, 65535
+; NO-SIMD128-NEXT:    i32.and $push17=, $13, $pop60
+; NO-SIMD128-NEXT:    i32.add $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT:    i32.const $push59=, 1
+; NO-SIMD128-NEXT:    i32.add $push20=, $pop19, $pop59
+; NO-SIMD128-NEXT:    i32.const $push58=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push21=, $pop20, $pop58
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop21
+; NO-SIMD128-NEXT:    i32.const $push57=, 65535
+; NO-SIMD128-NEXT:    i32.and $push23=, $4, $pop57
+; NO-SIMD128-NEXT:    i32.const $push56=, 65535
+; NO-SIMD128-NEXT:    i32.and $push22=, $12, $pop56
+; NO-SIMD128-NEXT:    i32.add $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT:    i32.const $push55=, 1
+; NO-SIMD128-NEXT:    i32.add $push25=, $pop24, $pop55
+; NO-SIMD128-NEXT:    i32.const $push54=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push26=, $pop25, $pop54
+; NO-SIMD128-NEXT:    i32.store16 6($0), $pop26
+; NO-SIMD128-NEXT:    i32.const $push53=, 65535
+; NO-SIMD128-NEXT:    i32.and $push28=, $3, $pop53
+; NO-SIMD128-NEXT:    i32.const $push52=, 65535
+; NO-SIMD128-NEXT:    i32.and $push27=, $11, $pop52
+; NO-SIMD128-NEXT:    i32.add $push29=, $pop28, $pop27
+; NO-SIMD128-NEXT:    i32.const $push51=, 1
+; NO-SIMD128-NEXT:    i32.add $push30=, $pop29, $pop51
+; NO-SIMD128-NEXT:    i32.const $push50=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push31=, $pop30, $pop50
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop31
+; NO-SIMD128-NEXT:    i32.const $push49=, 65535
+; NO-SIMD128-NEXT:    i32.and $push33=, $2, $pop49
+; NO-SIMD128-NEXT:    i32.const $push48=, 65535
+; NO-SIMD128-NEXT:    i32.and $push32=, $10, $pop48
+; NO-SIMD128-NEXT:    i32.add $push34=, $pop33, $pop32
+; NO-SIMD128-NEXT:    i32.const $push47=, 1
+; NO-SIMD128-NEXT:    i32.add $push35=, $pop34, $pop47
+; NO-SIMD128-NEXT:    i32.const $push46=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push36=, $pop35, $pop46
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop36
+; NO-SIMD128-NEXT:    i32.const $push45=, 65535
+; NO-SIMD128-NEXT:    i32.and $push38=, $1, $pop45
+; NO-SIMD128-NEXT:    i32.const $push44=, 65535
+; NO-SIMD128-NEXT:    i32.and $push37=, $9, $pop44
+; NO-SIMD128-NEXT:    i32.add $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT:    i32.const $push43=, 1
+; NO-SIMD128-NEXT:    i32.add $push40=, $pop39, $pop43
+; NO-SIMD128-NEXT:    i32.const $push42=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push41=, $pop40, $pop42
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop41
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: avgr_u_v8i16_zext:
+; NO-SIMD128-FAST:         .functype avgr_u_v8i16_zext (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.const $push71=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $9, $pop71
+; NO-SIMD128-FAST-NEXT:    i32.add $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $pop3, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push70=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push6=, $pop5, $pop70
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push69=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push8=, $2, $pop69
+; NO-SIMD128-FAST-NEXT:    i32.const $push68=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $10, $pop68
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.const $push67=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push10=, $pop9, $pop67
+; NO-SIMD128-FAST-NEXT:    i32.const $push66=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push11=, $pop10, $pop66
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop11
+; NO-SIMD128-FAST-NEXT:    i32.const $push65=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $3, $pop65
+; NO-SIMD128-FAST-NEXT:    i32.const $push64=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push12=, $11, $pop64
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $pop13, $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push63=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push15=, $pop14, $pop63
+; NO-SIMD128-FAST-NEXT:    i32.const $push62=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push16=, $pop15, $pop62
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push61=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push18=, $4, $pop61
+; NO-SIMD128-FAST-NEXT:    i32.const $push60=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push17=, $12, $pop60
+; NO-SIMD128-FAST-NEXT:    i32.add $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push59=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push20=, $pop19, $pop59
+; NO-SIMD128-FAST-NEXT:    i32.const $push58=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push21=, $pop20, $pop58
+; NO-SIMD128-FAST-NEXT:    i32.store16 6($0), $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push57=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $5, $pop57
+; NO-SIMD128-FAST-NEXT:    i32.const $push56=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push22=, $13, $pop56
+; NO-SIMD128-FAST-NEXT:    i32.add $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push55=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push25=, $pop24, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.const $push54=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push26=, $pop25, $pop54
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push53=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push28=, $6, $pop53
+; NO-SIMD128-FAST-NEXT:    i32.const $push52=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push27=, $14, $pop52
+; NO-SIMD128-FAST-NEXT:    i32.add $push29=, $pop28, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.const $push51=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push30=, $pop29, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.const $push50=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push31=, $pop30, $pop50
+; NO-SIMD128-FAST-NEXT:    i32.store16 10($0), $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push49=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push33=, $7, $pop49
+; NO-SIMD128-FAST-NEXT:    i32.const $push48=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push32=, $15, $pop48
+; NO-SIMD128-FAST-NEXT:    i32.add $push34=, $pop33, $pop32
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push35=, $pop34, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.const $push46=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push36=, $pop35, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.store16 12($0), $pop36
+; NO-SIMD128-FAST-NEXT:    i32.const $push45=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push38=, $8, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.const $push44=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $16, $pop44
+; NO-SIMD128-FAST-NEXT:    i32.add $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push40=, $pop39, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.const $push42=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push41=, $pop40, $pop42
+; NO-SIMD128-FAST-NEXT:    i32.store16 14($0), $pop41
+; NO-SIMD128-FAST-NEXT:    return
+  %x.zext = zext <8 x i16> %x to <8 x i32>
+  %y.zext = zext <8 x i16> %y to <8 x i32>
+  %a = add <8 x i32> %x.zext, %y.zext
+  %b = add <8 x i32> %a, splat (i32 1)
+  %c = lshr <8 x i32> %b, splat (i32 1)
+  %c.trunc = trunc <8 x i32> %c to <8 x i16>
+  ret <8 x i16> %c.trunc
+}
 define <16 x i8> @avgr_u_v16i8_wrap(<16 x i8> %x, <16 x i8> %y) {
 ; SIMD128-LABEL: avgr_u_v16i8_wrap:
 ; SIMD128:         .functype avgr_u_v16i8_wrap (v128, v128) -> (v128)
diff --git a/llvm/test/CodeGen/WebAssembly/simd-avgr.ll b/llvm/test/CodeGen/WebAssembly/simd-avgr.ll
deleted file mode 100644
index c4e6b12244e9b..0000000000000
--- a/llvm/test/CodeGen/WebAssembly/simd-avgr.ll
+++ /dev/null
@@ -1,224 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -O2 -mtriple=wasm32 -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
-
-;void f(unsigned char *x, unsigned char *y, int n) {
-;  for (int i = 0; i < n; i++)
-;    x[i] = (x[i] + y[i] + 1) / 2;
-;}
-
-define void @f(ptr %x, ptr %y, i32 %n) {
-; CHECK-LABEL: f:
-; CHECK:         .functype f (i32, i32, i32) -> ()
-; CHECK-NEXT:  # %bb.0: # %entry
-; CHECK-NEXT:    block
-; CHECK-NEXT:    i32.const $push0=, 1
-; CHECK-NEXT:    i32.lt_s $push1=, $2, $pop0
-; CHECK-NEXT:    br_if 0, $pop1 # 0: down to label0
-; CHECK-NEXT:  # %bb.1: # %for.body.preheader
-; CHECK-NEXT:    i32.const $3=, 0
-; CHECK-NEXT:    block
-; CHECK-NEXT:    i32.const $push2=, 16
-; CHECK-NEXT:    i32.lt_u $push3=, $2, $pop2
-; CHECK-NEXT:    br_if 0, $pop3 # 0: down to label1
-; CHECK-NEXT:  # %bb.2: # %vector.memcheck
-; CHECK-NEXT:    block
-; CHECK-NEXT:    i32.add $push5=, $1, $2
-; CHECK-NEXT:    i32.ge_u $push6=, $0, $pop5
-; CHECK-NEXT:    br_if 0, $pop6 # 0: down to label2
-; CHECK-NEXT:  # %bb.3: # %vector.memcheck
-; CHECK-NEXT:    i32.add $push4=, $0, $2
-; CHECK-NEXT:    i32.lt_u $push7=, $1, $pop4
-; CHECK-NEXT:    br_if 1, $pop7 # 1: down to label1
-; CHECK-NEXT:  .LBB0_4: # %vector.ph
-; CHECK-NEXT:    end_block # label2:
-; CHECK-NEXT:    local.copy $4=, $0
-; CHECK-NEXT:    local.copy $5=, $1
-; CHECK-NEXT:    i32.const $push8=, 2147483632
-; CHECK-NEXT:    i32.and $push33=, $2, $pop8
-; CHECK-NEXT:    local.tee $push32=, $3=, $pop33
-; CHECK-NEXT:    local.copy $6=, $pop32
-; CHECK-NEXT:  .LBB0_5: # %vector.body
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    loop # label3:
-; CHECK-NEXT:    v128.load $push10=, 0($4):p2align=0
-; CHECK-NEXT:    v128.load $push9=, 0($5):p2align=0
-; CHECK-NEXT:    i8x16.avgr_u $push11=, $pop10, $pop9
-; CHECK-NEXT:    v128.store 0($4):p2align=0, $pop11
-; CHECK-NEXT:    i32.const $push38=, 16
-; CHECK-NEXT:    i32.add $4=, $4, $pop38
-; CHECK-NEXT:    i32.const $push37=, 16
-; CHECK-NEXT:    i32.add $5=, $5, $pop37
-; CHECK-NEXT:    i32.const $push36=, -16
-; CHECK-NEXT:    i32.add $push35=, $6, $pop36
-; CHECK-NEXT:    local.tee $push34=, $6=, $pop35
-; CHECK-NEXT:    br_if 0, $pop34 # 0: up to label3
-; CHECK-NEXT:  # %bb.6: # %middle.block
-; CHECK-NEXT:    end_loop
-; CHECK-NEXT:    i32.eq $push12=, $2, $3
-; CHECK-NEXT:    br_if 1, $pop12 # 1: down to label0
-; CHECK-NEXT:  .LBB0_7: # %for.body.preheader16
-; CHECK-NEXT:    end_block # label1:
-; CHECK-NEXT:    i32.const $push40=, 1
-; CHECK-NEXT:    i32.or $4=, $3, $pop40
-; CHECK-NEXT:    block
-; CHECK-NEXT:    i32.const $push39=, 1
-; CHECK-NEXT:    i32.and $push13=, $2, $pop39
-; CHECK-NEXT:    i32.eqz $push58=, $pop13
-; CHECK-NEXT:    br_if 0, $pop58 # 0: down to label4
-; CHECK-NEXT:  # %bb.8: # %for.body.prol
-; CHECK-NEXT:    i32.add $push44=, $0, $3
-; CHECK-NEXT:    local.tee $push43=, $5=, $pop44
-; CHECK-NEXT:    i32.load8_u $push16=, 0($5)
-; CHECK-NEXT:    i32.add $push14=, $1, $3
-; CHECK-NEXT:    i32.load8_u $push15=, 0($pop14)
-; CHECK-NEXT:    i32.add $push17=, $pop16, $pop15
-; CHECK-NEXT:    i32.const $push42=, 1
-; CHECK-NEXT:    i32.add $push18=, $pop17, $pop42
-; CHECK-NEXT:    i32.const $push41=, 1
-; CHECK-NEXT:    i32.shr_u $push19=, $pop18, $pop41
-; CHECK-NEXT:    i32.store8 0($pop43), $pop19
-; CHECK-NEXT:    local.copy $3=, $4
-; CHECK-NEXT:  .LBB0_9: # %for.body.prol.loopexit
-; CHECK-NEXT:    end_block # label4:
-; CHECK-NEXT:    i32.eq $push20=, $2, $4
-; CHECK-NEXT:    br_if 0, $pop20 # 0: down to label0
-; CHECK-NEXT:  # %bb.10: # %for.body.preheader1
-; CHECK-NEXT:    i32.add $4=, $0, $3
-; CHECK-NEXT:    i32.add $5=, $1, $3
-; CHECK-NEXT:    i32.sub $6=, $2, $3
-; CHECK-NEXT:  .LBB0_11: # %for.body
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    loop # label5:
-; CHECK-NEXT:    i32.load8_u $push22=, 0($4)
-; CHECK-NEXT:    i32.load8_u $push21=, 0($5)
-; CHECK-NEXT:    i32.add $push23=, $pop22, $pop21
-; CHECK-NEXT:    i32.const $push57=, 1
-; CHECK-NEXT:    i32.add $push24=, $pop23, $pop57
-; CHECK-NEXT:    i32.const $push56=, 1
-; CHECK-NEXT:    i32.shr_u $push25=, $pop24, $pop56
-; CHECK-NEXT:    i32.store8 0($4), $pop25
-; CHECK-NEXT:    i32.const $push55=, 1
-; CHECK-NEXT:    i32.add $push54=, $4, $pop55
-; CHECK-NEXT:    local.tee $push53=, $2=, $pop54
-; CHECK-NEXT:    i32.load8_u $push26=, 0($2)
-; CHECK-NEXT:    i32.const $push52=, 1
-; CHECK-NEXT:    i32.add $push27=, $5, $pop52
-; CHECK-NEXT:    i32.load8_u $push28=, 0($pop27)
-; CHECK-NEXT:    i32.add $push29=, $pop26, $pop28
-; CHECK-NEXT:    i32.const $push51=, 1
-; CHECK-NEXT:    i32.add $push30=, $pop29, $pop51
-; CHECK-NEXT:    i32.const $push50=, 1
-; CHECK-NEXT:    i32.shr_u $push31=, $pop30, $pop50
-; CHECK-NEXT:    i32.store8 0($pop53), $pop31
-; CHECK-NEXT:    i32.const $push49=, 2
-; CHECK-NEXT:    i32.add $4=, $4, $pop49
-; CHECK-NEXT:    i32.const $push48=, 2
-; CHECK-NEXT:    i32.add $5=, $5, $pop48
-; CHECK-NEXT:    i32.const $push47=, -2
-; CHECK-NEXT:    i32.add $push46=, $6, $pop47
-; CHECK-NEXT:    local.tee $push45=, $6=, $pop46
-; CHECK-NEXT:    br_if 0, $pop45 # 0: up to label5
-; CHECK-NEXT:  .LBB0_12: # %for.cond.cleanup
-; CHECK-NEXT:    end_loop
-; CHECK-NEXT:    end_block # label0:
-; CHECK-NEXT:    return
-entry:
-  %cmp12 = icmp sgt i32 %n, 0
-  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %min.iters.check = icmp ult i32 %n, 16
-  br i1 %min.iters.check, label %for.body.preheader16, label %vector.memcheck
-
-vector.memcheck:
-  %scevgep = getelementptr i8, ptr %x, i32 %n
-  %scevgep14 = getelementptr i8, ptr %y, i32 %n
-  %bound0 = icmp ult ptr %x, %scevgep14
-  %bound1 = icmp ult ptr %y, %scevgep
-  %found.conflict = and i1 %bound0, %bound1
-  br i1 %found.conflict, label %for.body.preheader16, label %vector.ph
-
-vector.ph:
-  %n.vec = and i32 %n, 2147483632
-  br label %vector.body
-
-vector.body:
-  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %0 = getelementptr inbounds nuw i8, ptr %x, i32 %index
-  %wide.load = load <16 x i8>, ptr %0, align 1
-  %1 = zext <16 x i8> %wide.load to <16 x i16>
-  %2 = getelementptr inbounds nuw i8, ptr %y, i32 %index
-  %wide.load15 = load <16 x i8>, ptr %2, align 1
-  %3 = zext <16 x i8> %wide.load15 to <16 x i16>
-  %4 = add nuw nsw <16 x i16> %1, splat (i16 1)
-  %5 = add nuw nsw <16 x i16> %4, %3
-  %6 = lshr <16 x i16> %5, splat (i16 1)
-  %7 = trunc nuw <16 x i16> %6 to <16 x i8>
-  store <16 x i8> %7, ptr %0, align 1
-  %index.next = add nuw i32 %index, 16
-  %8 = icmp eq i32 %index.next, %n.vec
-  br i1 %8, label %middle.block, label %vector.body
-
-middle.block:
-  %cmp.n = icmp eq i32 %n, %n.vec
-  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader16
-
-for.body.preheader16:
-  %i.013.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
-  %.neg = or disjoint i32 %i.013.ph, 1
-  %xtraiter = and i32 %n, 1
-  %lcmp.mod.not = icmp eq i32 %xtraiter, 0
-  br i1 %lcmp.mod.not, label %for.body.prol.loopexit, label %for.body.prol
-
-for.body.prol:
-  %arrayidx.prol = getelementptr inbounds nuw i8, ptr %x, i32 %i.013.ph
-  %9 = load i8, ptr %arrayidx.prol, align 1
-  %conv.prol = zext i8 %9 to i16
-  %arrayidx1.prol = getelementptr inbounds nuw i8, ptr %y, i32 %i.013.ph
-  %10 = load i8, ptr %arrayidx1.prol, align 1
-  %conv2.prol = zext i8 %10 to i16
-  %add.prol = add nuw nsw i16 %conv.prol, 1
-  %add3.prol = add nuw nsw i16 %add.prol, %conv2.prol
-  %div11.prol = lshr i16 %add3.prol, 1
-  %conv4.prol = trunc nuw i16 %div11.prol to i8
-  store i8 %conv4.prol, ptr %arrayidx.prol, align 1
-  %inc.prol = or disjoint i32 %i.013.ph, 1
-  br label %for.body.prol.loopexit
-
-for.body.prol.loopexit:
-  %i.013.unr = phi i32 [ %i.013.ph, %for.body.preheader16 ], [ %inc.prol, %for.body.prol ]
-  %11 = icmp eq i32 %n, %.neg
-  br i1 %11, label %for.cond.cleanup, label %for.body
-
-for.cond.cleanup:
-  ret void
-
-for.body:
-  %i.013 = phi i32 [ %inc.1, %for.body ], [ %i.013.unr, %for.body.prol.loopexit ]
-  %arrayidx = getelementptr inbounds nuw i8, ptr %x, i32 %i.013
-  %12 = load i8, ptr %arrayidx, align 1
-  %conv = zext i8 %12 to i16
-  %arrayidx1 = getelementptr inbounds nuw i8, ptr %y, i32 %i.013
-  %13 = load i8, ptr %arrayidx1, align 1
-  %conv2 = zext i8 %13 to i16
-  %add = add nuw nsw i16 %conv, 1
-  %add3 = add nuw nsw i16 %add, %conv2
-  %div11 = lshr i16 %add3, 1
-  %conv4 = trunc nuw i16 %div11 to i8
-  store i8 %conv4, ptr %arrayidx, align 1
-  %inc = add nuw nsw i32 %i.013, 1
-  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %x, i32 %inc
-  %14 = load i8, ptr %arrayidx.1, align 1
-  %conv.1 = zext i8 %14 to i16
-  %arrayidx1.1 = getelementptr inbounds nuw i8, ptr %y, i32 %inc
-  %15 = load i8, ptr %arrayidx1.1, align 1
-  %conv2.1 = zext i8 %15 to i16
-  %add.1 = add nuw nsw i16 %conv.1, 1
-  %add3.1 = add nuw nsw i16 %add.1, %conv2.1
-  %div11.1 = lshr i16 %add3.1, 1
-  %conv4.1 = trunc nuw i16 %div11.1 to i8
-  store i8 %conv4.1, ptr %arrayidx.1, align 1
-  %inc.1 = add nuw nsw i32 %i.013, 2
-  %exitcond.not.1 = icmp eq i32 %inc.1, %n
-  br i1 %exitcond.not.1, label %for.cond.cleanup, label %for.body
-}



More information about the llvm-commits mailing list