[llvm] 41080b2 - [NFC][WebAssembly] Updated tests

Samuel Parker via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 26 02:30:38 PST 2023


Author: Samuel Parker
Date: 2023-01-26T10:26:24Z
New Revision: 41080b2fdd4b6c57d5a2926d6157b9847342b3a1

URL: https://github.com/llvm/llvm-project/commit/41080b2fdd4b6c57d5a2926d6157b9847342b3a1
DIFF: https://github.com/llvm/llvm-project/commit/41080b2fdd4b6c57d5a2926d6157b9847342b3a1.diff

LOG: [NFC][WebAssembly] Updated tests

Run update_llc_test_checks on a number of codegen tests.

Added: 
    

Modified: 
    llvm/test/CodeGen/WebAssembly/comparisons-f32.ll
    llvm/test/CodeGen/WebAssembly/comparisons-f64.ll
    llvm/test/CodeGen/WebAssembly/f32.ll
    llvm/test/CodeGen/WebAssembly/f64.ll
    llvm/test/CodeGen/WebAssembly/i128.ll
    llvm/test/CodeGen/WebAssembly/libcalls.ll
    llvm/test/CodeGen/WebAssembly/mem-intrinsics.ll
    llvm/test/CodeGen/WebAssembly/offset-fastisel.ll
    llvm/test/CodeGen/WebAssembly/return-int32.ll
    llvm/test/CodeGen/WebAssembly/return-void.ll
    llvm/test/CodeGen/WebAssembly/returned.ll
    llvm/test/CodeGen/WebAssembly/select.ll
    llvm/test/CodeGen/WebAssembly/simd-arith.ll
    llvm/test/CodeGen/WebAssembly/simd-build-pair.ll
    llvm/test/CodeGen/WebAssembly/simd-illegal-signext.ll
    llvm/test/CodeGen/WebAssembly/simd.ll
    llvm/test/CodeGen/WebAssembly/stack-protector.ll
    llvm/test/CodeGen/WebAssembly/umulo-i64.ll
    llvm/test/CodeGen/WebAssembly/userstack.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/WebAssembly/comparisons-f32.ll b/llvm/test/CodeGen/WebAssembly/comparisons-f32.ll
index f4144f86e7066..b26b5027863fb 100644
--- a/llvm/test/CodeGen/WebAssembly/comparisons-f32.ll
+++ b/llvm/test/CodeGen/WebAssembly/comparisons-f32.ll
@@ -1,94 +1,118 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
 
 ; Test that basic 32-bit floating-point comparison operations assemble as
 ; expected.
 
 target triple = "wasm32-unknown-unknown"
 
-; CHECK-LABEL: ord_f32:
-; CHECK-NEXT: .functype ord_f32 (f32, f32) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: f32.eq $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: local.get $push[[L2:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: local.get $push[[L3:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f32.eq $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
-; CHECK-NEXT: i32.and $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM2]]{{$}}
 define i32 @ord_f32(float %x, float %y) {
+; CHECK-LABEL: ord_f32:
+; CHECK:         .functype ord_f32 (f32, f32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 0
+; CHECK-NEXT:    f32.eq $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 1
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    f32.eq $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.and $push2=, $pop1, $pop0
+; CHECK-NEXT:    return $pop2
   %a = fcmp ord float %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: uno_f32:
-; CHECK-NEXT: .functype uno_f32 (f32, f32) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: f32.ne $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: local.get $push[[L2:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: local.get $push[[L3:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f32.ne $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
-; CHECK-NEXT: i32.or $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM2]]{{$}}
 define i32 @uno_f32(float %x, float %y) {
+; CHECK-LABEL: uno_f32:
+; CHECK:         .functype uno_f32 (f32, f32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 0
+; CHECK-NEXT:    f32.ne $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 1
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    f32.ne $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.or $push2=, $pop1, $pop0
+; CHECK-NEXT:    return $pop2
   %a = fcmp uno float %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: oeq_f32:
-; CHECK-NEXT: .functype oeq_f32 (f32, f32) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f32.eq $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @oeq_f32(float %x, float %y) {
+; CHECK-LABEL: oeq_f32:
+; CHECK:         .functype oeq_f32 (f32, f32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.eq $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fcmp oeq float %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: une_f32:
-; CHECK: f32.ne $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @une_f32(float %x, float %y) {
+; CHECK-LABEL: une_f32:
+; CHECK:         .functype une_f32 (f32, f32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.ne $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fcmp une float %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: olt_f32:
-; CHECK: f32.lt $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @olt_f32(float %x, float %y) {
+; CHECK-LABEL: olt_f32:
+; CHECK:         .functype olt_f32 (f32, f32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.lt $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fcmp olt float %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: ole_f32:
-; CHECK: f32.le $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ole_f32(float %x, float %y) {
+; CHECK-LABEL: ole_f32:
+; CHECK:         .functype ole_f32 (f32, f32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.le $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fcmp ole float %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: ogt_f32:
-; CHECK: f32.gt $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ogt_f32(float %x, float %y) {
+; CHECK-LABEL: ogt_f32:
+; CHECK:         .functype ogt_f32 (f32, f32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.gt $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fcmp ogt float %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: oge_f32:
-; CHECK: f32.ge $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @oge_f32(float %x, float %y) {
+; CHECK-LABEL: oge_f32:
+; CHECK:         .functype oge_f32 (f32, f32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.ge $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fcmp oge float %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
@@ -97,104 +121,117 @@ define i32 @oge_f32(float %x, float %y) {
 ; Expanded comparisons, which also check for NaN.
 ; These simply rely on SDAG's Expand cond code action.
 
-; CHECK-LABEL: ueq_f32:
-; CHECK-NEXT: .functype ueq_f32 (f32, f32) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f32.gt $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: local.get $push[[L2:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L3:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f32.lt $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
-; CHECK-NEXT: i32.or $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
-; CHECK-NEXT: i32.const $push[[C0:[0-9]+]]=, 1
-; CHECK-NEXT: i32.xor $push[[NUM3:[0-9]+]]=, $pop[[NUM2]], $pop[[C0]]{{$}}
-; CHECK-NEXT: return $pop[[NUM3]]{{$}}
 define i32 @ueq_f32(float %x, float %y) {
+; CHECK-LABEL: ueq_f32:
+; CHECK:         .functype ueq_f32 (f32, f32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    f32.gt $push1=, $pop6, $pop5
+; CHECK-NEXT:    local.get $push8=, 0
+; CHECK-NEXT:    local.get $push7=, 1
+; CHECK-NEXT:    f32.lt $push0=, $pop8, $pop7
+; CHECK-NEXT:    i32.or $push2=, $pop1, $pop0
+; CHECK-NEXT:    i32.const $push3=, 1
+; CHECK-NEXT:    i32.xor $push4=, $pop2, $pop3
+; CHECK-NEXT:    return $pop4
   %a = fcmp ueq float %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: one_f32:
-; CHECK-NEXT: .functype one_f32 (f32, f32) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f32.gt $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: local.get $push[[L2:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L3:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f32.lt $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
-; CHECK-NEXT: i32.or $push[[NUM4:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM4]]
 define i32 @one_f32(float %x, float %y) {
+; CHECK-LABEL: one_f32:
+; CHECK:         .functype one_f32 (f32, f32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    f32.gt $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    f32.lt $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.or $push2=, $pop1, $pop0
+; CHECK-NEXT:    return $pop2
   %a = fcmp one float %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: ult_f32:
-; CHECK-NEXT: .functype ult_f32 (f32, f32) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f32.ge $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: i32.const $push[[C0:[0-9]+]]=, 1
-; CHECK-NEXT: i32.xor $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[C0]]{{$}}
-; CHECK-NEXT: return $pop[[NUM2]]{{$}}
 define i32 @ult_f32(float %x, float %y) {
+; CHECK-LABEL: ult_f32:
+; CHECK:         .functype ult_f32 (f32, f32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    f32.ge $push0=, $pop4, $pop3
+; CHECK-NEXT:    i32.const $push1=, 1
+; CHECK-NEXT:    i32.xor $push2=, $pop0, $pop1
+; CHECK-NEXT:    return $pop2
   %a = fcmp ult float %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: ule_f32:
-; CHECK-NEXT: .functype ule_f32 (f32, f32) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f32.gt $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: i32.const $push[[C0:[0-9]+]]=, 1
-; CHECK-NEXT: i32.xor $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[C0]]{{$}}
-; CHECK-NEXT: return $pop[[NUM2]]{{$}}
 define i32 @ule_f32(float %x, float %y) {
+; CHECK-LABEL: ule_f32:
+; CHECK:         .functype ule_f32 (f32, f32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    f32.gt $push0=, $pop4, $pop3
+; CHECK-NEXT:    i32.const $push1=, 1
+; CHECK-NEXT:    i32.xor $push2=, $pop0, $pop1
+; CHECK-NEXT:    return $pop2
   %a = fcmp ule float %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: ugt_f32:
-; CHECK-NEXT: .functype ugt_f32 (f32, f32) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f32.le $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: i32.const $push[[C0:[0-9]+]]=, 1
-; CHECK-NEXT: i32.xor $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[C0]]{{$}}
-; CHECK-NEXT: return $pop[[NUM2]]{{$}}
 define i32 @ugt_f32(float %x, float %y) {
+; CHECK-LABEL: ugt_f32:
+; CHECK:         .functype ugt_f32 (f32, f32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    f32.le $push0=, $pop4, $pop3
+; CHECK-NEXT:    i32.const $push1=, 1
+; CHECK-NEXT:    i32.xor $push2=, $pop0, $pop1
+; CHECK-NEXT:    return $pop2
   %a = fcmp ugt float %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: uge_f32:
-; CHECK-NEXT: .functype uge_f32 (f32, f32) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f32.lt $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: i32.const $push[[C0:[0-9]+]]=, 1
-; CHECK-NEXT: i32.xor $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[C0]]{{$}}
-; CHECK-NEXT: return $pop[[NUM2]]{{$}}
 define i32 @uge_f32(float %x, float %y) {
+; CHECK-LABEL: uge_f32:
+; CHECK:         .functype uge_f32 (f32, f32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    f32.lt $push0=, $pop4, $pop3
+; CHECK-NEXT:    i32.const $push1=, 1
+; CHECK-NEXT:    i32.xor $push2=, $pop0, $pop1
+; CHECK-NEXT:    return $pop2
   %a = fcmp uge float %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: olt_f32_branch
-; CHECK:      local.get	$push[[L4:[0-9]+]]=, 0
-; CHECK-NEXT: local.get	$push[[L3:[0-9]+]]=, 1
-; CHECK-NEXT: f32.lt  	$push[[NUM0:[0-9]+]]=, $pop[[L4]], $pop[[L3]]
-; CHECK-NEXT: i32.eqz 	$push[[NUM3:[0-9]+]]=, $pop[[NUM0]]
-; CHECK-NEXT: br_if   	0, $pop[[NUM3]]
-; CHECK-NEXT: call	call1
 define void @olt_f32_branch(float %a, float %b) {
+; CHECK-LABEL: olt_f32_branch:
+; CHECK:         .functype olt_f32_branch (f32, f32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.lt $push0=, $pop2, $pop1
+; CHECK-NEXT:    i32.eqz $push3=, $pop0
+; CHECK-NEXT:    br_if 0, $pop3 # 0: down to label0
+; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    call call1
+; CHECK-NEXT:  .LBB14_2: # %if.end
+; CHECK-NEXT:    end_block # label0:
+; CHECK-NEXT:    return
 entry:
   %cmp = fcmp olt float %a, %b
   br i1 %cmp, label %if.then, label %if.end
@@ -207,14 +244,21 @@ if.end:
   ret void
 }
 
-; CHECK-LABEL: ole_f32_branch
-; CHECK:      local.get	$push[[L4:[0-9]+]]=, 0
-; CHECK-NEXT: local.get	$push[[L3:[0-9]+]]=, 1
-; CHECK-NEXT: f32.le  	$push[[NUM0:[0-9]+]]=, $pop[[L4]], $pop[[L3]]
-; CHECK-NEXT: i32.eqz 	$push[[NUM3:[0-9]+]]=, $pop[[NUM0]]
-; CHECK-NEXT: br_if   	0, $pop[[NUM3]]
-; CHECK-NEXT: call	call1
 define void @ole_f32_branch(float %a, float %b) {
+; CHECK-LABEL: ole_f32_branch:
+; CHECK:         .functype ole_f32_branch (f32, f32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.le $push0=, $pop2, $pop1
+; CHECK-NEXT:    i32.eqz $push3=, $pop0
+; CHECK-NEXT:    br_if 0, $pop3 # 0: down to label1
+; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    call call1
+; CHECK-NEXT:  .LBB15_2: # %if.end
+; CHECK-NEXT:    end_block # label1:
+; CHECK-NEXT:    return
 entry:
   %cmp = fcmp ole float %a, %b
   br i1 %cmp, label %if.then, label %if.end
@@ -227,14 +271,21 @@ if.end:
   ret void
 }
 
-; CHECK-LABEL: ugt_f32_branch
-; CHECK:      local.get	$push[[L4:[0-9]+]]=, 0
-; CHECK-NEXT: local.get	$push[[L3:[0-9]+]]=, 1
-; CHECK-NEXT: f32.le  	$push[[NUM0:[0-9]+]]=, $pop[[L4]], $pop[[L3]]
-; CHECK-NEXT: i32.eqz 	$push[[NUM3:[0-9]+]]=, $pop[[NUM0]]
-; CHECK-NEXT: br_if   	0, $pop[[NUM3]]
-; CHECK-NEXT: call	call1
 define void @ugt_f32_branch(float %a, float %b) {
+; CHECK-LABEL: ugt_f32_branch:
+; CHECK:         .functype ugt_f32_branch (f32, f32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.le $push0=, $pop2, $pop1
+; CHECK-NEXT:    i32.eqz $push3=, $pop0
+; CHECK-NEXT:    br_if 0, $pop3 # 0: down to label2
+; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    call call1
+; CHECK-NEXT:  .LBB16_2: # %if.end
+; CHECK-NEXT:    end_block # label2:
+; CHECK-NEXT:    return
 entry:
   %cmp = fcmp ugt float %a, %b
   br i1 %cmp, label %if.end, label %if.then
@@ -247,14 +298,21 @@ if.end:
   ret void
 }
 
-; CHECK-LABEL: ogt_f32_branch
-; CHECK:      local.get	$push[[L4:[0-9]+]]=, 0
-; CHECK-NEXT: local.get	$push[[L3:[0-9]+]]=, 1
-; CHECK-NEXT: f32.gt  	$push[[NUM0:[0-9]+]]=, $pop[[L4]], $pop[[L3]]
-; CHECK-NEXT: i32.eqz 	$push[[NUM3:[0-9]+]]=, $pop[[NUM0]]
-; CHECK-NEXT: br_if   	0, $pop[[NUM3]]
-; CHECK-NEXT: call	call1
 define void @ogt_f32_branch(float %a, float %b) {
+; CHECK-LABEL: ogt_f32_branch:
+; CHECK:         .functype ogt_f32_branch (f32, f32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.gt $push0=, $pop2, $pop1
+; CHECK-NEXT:    i32.eqz $push3=, $pop0
+; CHECK-NEXT:    br_if 0, $pop3 # 0: down to label3
+; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    call call1
+; CHECK-NEXT:  .LBB17_2: # %if.end
+; CHECK-NEXT:    end_block # label3:
+; CHECK-NEXT:    return
 entry:
   %cmp = fcmp ogt float %a, %b
   br i1 %cmp, label %if.then, label %if.end
@@ -267,14 +325,21 @@ if.end:
   ret void
 }
 
-; CHECK-LABEL: ult_f32_branch
-; CHECK:      local.get	$push[[L4:[0-9]+]]=, 0
-; CHECK-NEXT: local.get	$push[[L3:[0-9]+]]=, 1
-; CHECK-NEXT: f32.ge  	$push[[NUM0:[0-9]+]]=, $pop[[L4]], $pop[[L3]]
-; CHECK-NEXT: i32.eqz 	$push[[NUM3:[0-9]+]]=, $pop[[NUM0]]
-; CHECK-NEXT: br_if   	0, $pop[[NUM3]]
-; CHECK-NEXT: call	call1
 define void @ult_f32_branch(float %a, float %b) {
+; CHECK-LABEL: ult_f32_branch:
+; CHECK:         .functype ult_f32_branch (f32, f32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.ge $push0=, $pop2, $pop1
+; CHECK-NEXT:    i32.eqz $push3=, $pop0
+; CHECK-NEXT:    br_if 0, $pop3 # 0: down to label4
+; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    call call1
+; CHECK-NEXT:  .LBB18_2: # %if.end
+; CHECK-NEXT:    end_block # label4:
+; CHECK-NEXT:    return
 entry:
   %cmp = fcmp ult float %a, %b
   br i1 %cmp, label %if.end, label %if.then
@@ -287,14 +352,21 @@ if.end:
   ret void
 }
 
-; CHECK-LABEL: ule_f32_branch
-; CHECK:      local.get	$push[[L4:[0-9]+]]=, 0
-; CHECK-NEXT: local.get	$push[[L3:[0-9]+]]=, 1
-; CHECK-NEXT: f32.ge  	$push[[NUM0:[0-9]+]]=, $pop[[L4]], $pop[[L3]]
-; CHECK-NEXT: i32.eqz 	$push[[NUM3:[0-9]+]]=, $pop[[NUM0]]
-; CHECK-NEXT: br_if   	0, $pop[[NUM3]]
-; CHECK-NEXT: call	call1
 define void @ule_f32_branch(float %a, float %b) {
+; CHECK-LABEL: ule_f32_branch:
+; CHECK:         .functype ule_f32_branch (f32, f32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.ge $push0=, $pop2, $pop1
+; CHECK-NEXT:    i32.eqz $push3=, $pop0
+; CHECK-NEXT:    br_if 0, $pop3 # 0: down to label5
+; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    call call1
+; CHECK-NEXT:  .LBB19_2: # %if.end
+; CHECK-NEXT:    end_block # label5:
+; CHECK-NEXT:    return
 entry:
   %cmp = fcmp ult float %a, %b
   br i1 %cmp, label %if.end, label %if.then
@@ -307,16 +379,31 @@ if.end:
   ret void
 }
 
-; CHECK-LABEL: xor_zext_switch
-; CHECK:      i32.const	$push[[L1:[0-9]+]]=, 0
-; CHECK-NEXT: br_if   	0, $pop[[L1]]
-; CHECK-NEXT: block
-; CHECK-NEXT: block
-; CHECK-NEXT: local.get	$push[[L3:[0-9]+]]=, 0
-; CHECK-NEXT: local.get	$push[[L2:[0-9]+]]=, 1
-; CHECK-NEXT: f32.ge  	$push[[L0:[0-9]+]]=, $pop[[L3]], $pop[[L2]]
-; CHECK-NEXT: br_table 	$pop[[L0]], 0, 1, 0
 define void @xor_zext_switch(float %a, float %b) {
+; CHECK-LABEL: xor_zext_switch:
+; CHECK:         .functype xor_zext_switch (f32, f32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    i32.const $push1=, 0
+; CHECK-NEXT:    br_if 0, $pop1 # 0: down to label6
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push3=, 0
+; CHECK-NEXT:    local.get $push2=, 1
+; CHECK-NEXT:    f32.ge $push0=, $pop3, $pop2
+; CHECK-NEXT:    br_table $pop0, 0, 1, 0 # 0: down to label8
+; CHECK-NEXT:    # 1: down to label7
+; CHECK-NEXT:  .LBB20_2: # %sw.bb.1
+; CHECK-NEXT:    end_block # label8:
+; CHECK-NEXT:    call foo1
+; CHECK-NEXT:    return
+; CHECK-NEXT:  .LBB20_3: # %sw.bb.2
+; CHECK-NEXT:    end_block # label7:
+; CHECK-NEXT:    call foo2
+; CHECK-NEXT:  .LBB20_4: # %exit
+; CHECK-NEXT:    end_block # label6:
+; CHECK-NEXT:    return
 entry:
   %cmp = fcmp ult float %a, %b
   %zext = zext i1 %cmp to i32
@@ -338,18 +425,41 @@ exit:
   ret void
 }
 
-; CHECK-LABEL: xor_add_switch
-; CHECK:      local.get	$push[[L8:[0-9]+]]=, 0
-; CHECK-NEXT: local.get	$push[[L7:[0-9]+]]=, 1
-; CHECK-NEXT: f32.ge  	$push[[L1:[0-9]+]]=, $pop[[L8]], $pop[[L7]]
-; CHECK-NEXT: i32.const	$push[[L2:[0-9]+]]=, 1
-; CHECK-NEXT: i32.xor 	$push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]
-; CHECK-NEXT: i32.const	$push[[L6:[0-9]+]]=, 1
-; CHECK-NEXT: i32.add 	$push[[L4:[0-9]+]]=, $pop[[L3]], $pop[[L6]]
-; CHECK-NEXT: i32.const	$push[[L5:[0-9]+]]=, 1
-; CHECK-NEXT: i32.xor 	$push[[L0:[0-9]+]]=, $pop[[L4]], $pop[[L5]]
-; CHECK-NEXT: br_table 	$pop[[L0]], 0, 1, 2, 3
 define void @xor_add_switch(float %a, float %b) {
+; CHECK-LABEL: xor_add_switch:
+; CHECK:         .functype xor_add_switch (f32, f32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    block
+; CHECK-NEXT:    block
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push8=, 0
+; CHECK-NEXT:    local.get $push7=, 1
+; CHECK-NEXT:    f32.ge $push1=, $pop8, $pop7
+; CHECK-NEXT:    i32.const $push2=, 1
+; CHECK-NEXT:    i32.xor $push3=, $pop1, $pop2
+; CHECK-NEXT:    i32.const $push6=, 1
+; CHECK-NEXT:    i32.add $push4=, $pop3, $pop6
+; CHECK-NEXT:    i32.const $push5=, 1
+; CHECK-NEXT:    i32.xor $push0=, $pop4, $pop5
+; CHECK-NEXT:    br_table $pop0, 0, 1, 2, 3 # 0: down to label12
+; CHECK-NEXT:    # 1: down to label11
+; CHECK-NEXT:    # 2: down to label10
+; CHECK-NEXT:    # 3: down to label9
+; CHECK-NEXT:  .LBB21_1: # %sw.bb.1
+; CHECK-NEXT:    end_block # label12:
+; CHECK-NEXT:    call foo1
+; CHECK-NEXT:    return
+; CHECK-NEXT:  .LBB21_2: # %sw.bb.2
+; CHECK-NEXT:    end_block # label11:
+; CHECK-NEXT:    call foo2
+; CHECK-NEXT:    return
+; CHECK-NEXT:  .LBB21_3: # %sw.bb.3
+; CHECK-NEXT:    end_block # label10:
+; CHECK-NEXT:    call foo3
+; CHECK-NEXT:  .LBB21_4: # %exit
+; CHECK-NEXT:    end_block # label9:
+; CHECK-NEXT:    return
 entry:
   %cmp = fcmp ult float %a, %b
   %zext = zext i1 %cmp to i32

diff  --git a/llvm/test/CodeGen/WebAssembly/comparisons-f64.ll b/llvm/test/CodeGen/WebAssembly/comparisons-f64.ll
index 03a85f4e7b8d0..30828adfa9f95 100644
--- a/llvm/test/CodeGen/WebAssembly/comparisons-f64.ll
+++ b/llvm/test/CodeGen/WebAssembly/comparisons-f64.ll
@@ -1,94 +1,118 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
 
 ; Test that basic 64-bit floating-point comparison operations assemble as
 ; expected.
 
 target triple = "wasm32-unknown-unknown"
 
-; CHECK-LABEL: ord_f64:
-; CHECK-NEXT: .functype ord_f64 (f64, f64) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: f64.eq $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: local.get $push[[L2:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: local.get $push[[L3:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f64.eq $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
-; CHECK-NEXT: i32.and $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM2]]{{$}}
 define i32 @ord_f64(double %x, double %y) {
+; CHECK-LABEL: ord_f64:
+; CHECK:         .functype ord_f64 (f64, f64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 0
+; CHECK-NEXT:    f64.eq $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 1
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    f64.eq $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.and $push2=, $pop1, $pop0
+; CHECK-NEXT:    return $pop2
   %a = fcmp ord double %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: uno_f64:
-; CHECK-NEXT: .functype uno_f64 (f64, f64) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: f64.ne $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: local.get $push[[L2:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: local.get $push[[L3:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f64.ne $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
-; CHECK-NEXT: i32.or $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM2]]{{$}}
 define i32 @uno_f64(double %x, double %y) {
+; CHECK-LABEL: uno_f64:
+; CHECK:         .functype uno_f64 (f64, f64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 0
+; CHECK-NEXT:    f64.ne $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 1
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    f64.ne $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.or $push2=, $pop1, $pop0
+; CHECK-NEXT:    return $pop2
   %a = fcmp uno double %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: oeq_f64:
-; CHECK-NEXT: .functype oeq_f64 (f64, f64) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f64.eq $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @oeq_f64(double %x, double %y) {
+; CHECK-LABEL: oeq_f64:
+; CHECK:         .functype oeq_f64 (f64, f64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.eq $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fcmp oeq double %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: une_f64:
-; CHECK: f64.ne $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @une_f64(double %x, double %y) {
+; CHECK-LABEL: une_f64:
+; CHECK:         .functype une_f64 (f64, f64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.ne $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fcmp une double %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: olt_f64:
-; CHECK: f64.lt $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @olt_f64(double %x, double %y) {
+; CHECK-LABEL: olt_f64:
+; CHECK:         .functype olt_f64 (f64, f64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.lt $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fcmp olt double %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: ole_f64:
-; CHECK: f64.le $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ole_f64(double %x, double %y) {
+; CHECK-LABEL: ole_f64:
+; CHECK:         .functype ole_f64 (f64, f64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.le $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fcmp ole double %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: ogt_f64:
-; CHECK: f64.gt $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ogt_f64(double %x, double %y) {
+; CHECK-LABEL: ogt_f64:
+; CHECK:         .functype ogt_f64 (f64, f64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.gt $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fcmp ogt double %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: oge_f64:
-; CHECK: f64.ge $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @oge_f64(double %x, double %y) {
+; CHECK-LABEL: oge_f64:
+; CHECK:         .functype oge_f64 (f64, f64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.ge $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fcmp oge double %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
@@ -96,104 +120,117 @@ define i32 @oge_f64(double %x, double %y) {
 
 ; Expanded comparisons, which also check for NaN.
 
-; CHECK-LABEL: ueq_f64:
-; CHECK-NEXT: .functype ueq_f64 (f64, f64) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f64.gt $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: local.get $push[[L2:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L3:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f64.lt $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
-; CHECK-NEXT: i32.or $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
-; CHECK-NEXT: i32.const $push[[C0:[0-9]+]]=, 1
-; CHECK-NEXT: i32.xor $push[[NUM3:[0-9]+]]=, $pop[[NUM2]], $pop[[C0]]{{$}}
-; CHECK-NEXT: return $pop[[NUM3]]{{$}}
 define i32 @ueq_f64(double %x, double %y) {
+; CHECK-LABEL: ueq_f64:
+; CHECK:         .functype ueq_f64 (f64, f64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    f64.gt $push1=, $pop6, $pop5
+; CHECK-NEXT:    local.get $push8=, 0
+; CHECK-NEXT:    local.get $push7=, 1
+; CHECK-NEXT:    f64.lt $push0=, $pop8, $pop7
+; CHECK-NEXT:    i32.or $push2=, $pop1, $pop0
+; CHECK-NEXT:    i32.const $push3=, 1
+; CHECK-NEXT:    i32.xor $push4=, $pop2, $pop3
+; CHECK-NEXT:    return $pop4
   %a = fcmp ueq double %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: one_f64:
-; CHECK-NEXT: .functype one_f64 (f64, f64) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f64.gt $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: local.get $push[[L2:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L3:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f64.lt $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
-; CHECK-NEXT: i32.or $push[[NUM4:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
-; CHECK-NEXT: return $pop[[NUM4]]
 define i32 @one_f64(double %x, double %y) {
+; CHECK-LABEL: one_f64:
+; CHECK:         .functype one_f64 (f64, f64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    f64.gt $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    f64.lt $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.or $push2=, $pop1, $pop0
+; CHECK-NEXT:    return $pop2
   %a = fcmp one double %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: ult_f64:
-; CHECK-NEXT: .functype ult_f64 (f64, f64) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f64.ge $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: i32.const $push[[C0:[0-9]+]]=, 1
-; CHECK-NEXT: i32.xor $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[C0]]{{$}}
-; CHECK-NEXT: return $pop[[NUM2]]{{$}}
 define i32 @ult_f64(double %x, double %y) {
+; CHECK-LABEL: ult_f64:
+; CHECK:         .functype ult_f64 (f64, f64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    f64.ge $push0=, $pop4, $pop3
+; CHECK-NEXT:    i32.const $push1=, 1
+; CHECK-NEXT:    i32.xor $push2=, $pop0, $pop1
+; CHECK-NEXT:    return $pop2
   %a = fcmp ult double %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: ule_f64:
-; CHECK-NEXT: .functype ule_f64 (f64, f64) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f64.gt $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: i32.const $push[[C0:[0-9]+]]=, 1
-; CHECK-NEXT: i32.xor $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[C0]]{{$}}
-; CHECK-NEXT: return $pop[[NUM2]]{{$}}
 define i32 @ule_f64(double %x, double %y) {
+; CHECK-LABEL: ule_f64:
+; CHECK:         .functype ule_f64 (f64, f64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    f64.gt $push0=, $pop4, $pop3
+; CHECK-NEXT:    i32.const $push1=, 1
+; CHECK-NEXT:    i32.xor $push2=, $pop0, $pop1
+; CHECK-NEXT:    return $pop2
   %a = fcmp ule double %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: ugt_f64:
-; CHECK-NEXT: .functype ugt_f64 (f64, f64) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f64.le $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: i32.const $push[[C0:[0-9]+]]=, 1
-; CHECK-NEXT: i32.xor $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[C0]]{{$}}
-; CHECK-NEXT: return $pop[[NUM2]]{{$}}
 define i32 @ugt_f64(double %x, double %y) {
+; CHECK-LABEL: ugt_f64:
+; CHECK:         .functype ugt_f64 (f64, f64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    f64.le $push0=, $pop4, $pop3
+; CHECK-NEXT:    i32.const $push1=, 1
+; CHECK-NEXT:    i32.xor $push2=, $pop0, $pop1
+; CHECK-NEXT:    return $pop2
   %a = fcmp ugt double %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: uge_f64:
-; CHECK-NEXT: .functype uge_f64 (f64, f64) -> (i32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f64.lt $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: i32.const $push[[C0:[0-9]+]]=, 1
-; CHECK-NEXT: i32.xor $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[C0]]{{$}}
-; CHECK-NEXT: return $pop[[NUM2]]{{$}}
 define i32 @uge_f64(double %x, double %y) {
+; CHECK-LABEL: uge_f64:
+; CHECK:         .functype uge_f64 (f64, f64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    f64.lt $push0=, $pop4, $pop3
+; CHECK-NEXT:    i32.const $push1=, 1
+; CHECK-NEXT:    i32.xor $push2=, $pop0, $pop1
+; CHECK-NEXT:    return $pop2
   %a = fcmp uge double %x, %y
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: olt_f64_branch:
-; CHECK:      local.get	$push[[L0:[0-9]+]]=, 0
-; CHECK-NEXT: local.get	$push[[L1:[0-9]+]]=, 1
-; CHECK-NEXT: f64.lt  	$push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; CHECK-NEXT: i32.eqz   $push[[NUM3:[0-9]+]]=, $pop[[NUM0]]
-; CHECK-NEXT: br_if   	0, $pop[[NUM3]]
-; CHECK-NEXT: call	call1
 define void @olt_f64_branch(double %a, double %b) {
+; CHECK-LABEL: olt_f64_branch:
+; CHECK:         .functype olt_f64_branch (f64, f64) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.lt $push0=, $pop2, $pop1
+; CHECK-NEXT:    i32.eqz $push3=, $pop0
+; CHECK-NEXT:    br_if 0, $pop3 # 0: down to label0
+; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    call call1
+; CHECK-NEXT:  .LBB14_2: # %if.end
+; CHECK-NEXT:    end_block # label0:
+; CHECK-NEXT:    return
 entry:
   %cmp = fcmp olt double %a, %b
   br i1 %cmp, label %if.then, label %if.end
@@ -206,14 +243,21 @@ if.end:
   ret void
 }
 
-; CHECK-LABEL: ole_f64_branch:
-; CHECK:      local.get	$push[[L0:[0-9]+]]=, 0
-; CHECK-NEXT: local.get	$push[[L1:[0-9]+]]=, 1
-; CHECK-NEXT: f64.le  	$push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; CHECK-NEXT: i32.eqz   $push[[NUM3:[0-9]+]]=, $pop[[NUM0]]
-; CHECK-NEXT: br_if   	0, $pop[[NUM3]]
-; CHECK-NEXT: call	call1
 define void @ole_f64_branch(double %a, double %b) {
+; CHECK-LABEL: ole_f64_branch:
+; CHECK:         .functype ole_f64_branch (f64, f64) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.le $push0=, $pop2, $pop1
+; CHECK-NEXT:    i32.eqz $push3=, $pop0
+; CHECK-NEXT:    br_if 0, $pop3 # 0: down to label1
+; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    call call1
+; CHECK-NEXT:  .LBB15_2: # %if.end
+; CHECK-NEXT:    end_block # label1:
+; CHECK-NEXT:    return
 entry:
   %cmp = fcmp ole double %a, %b
   br i1 %cmp, label %if.then, label %if.end
@@ -226,14 +270,21 @@ if.end:
   ret void
 }
 
-; CHECK-LABEL: ugt_f64_branch:
-; CHECK:      local.get	$push[[L0:[0-9]+]]=, 0
-; CHECK-NEXT: local.get	$push[[L1:[0-9]+]]=, 1
-; CHECK-NEXT: f64.le  	$push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; CHECK-NEXT: i32.eqz   $push[[NUM3:[0-9]+]]=, $pop[[NUM0]]
-; CHECK-NEXT: br_if   	0, $pop[[NUM3]]
-; CHECK-NEXT: call	call1
 define void @ugt_f64_branch(double %a, double %b) {
+; CHECK-LABEL: ugt_f64_branch:
+; CHECK:         .functype ugt_f64_branch (f64, f64) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.le $push0=, $pop2, $pop1
+; CHECK-NEXT:    i32.eqz $push3=, $pop0
+; CHECK-NEXT:    br_if 0, $pop3 # 0: down to label2
+; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    call call1
+; CHECK-NEXT:  .LBB16_2: # %if.end
+; CHECK-NEXT:    end_block # label2:
+; CHECK-NEXT:    return
 entry:
   %cmp = fcmp ugt double %a, %b
   br i1 %cmp, label %if.end, label %if.then
@@ -246,14 +297,21 @@ if.end:
   ret void
 }
 
-; CHECK-LABEL: ogt_f64_branch:
-; CHECK:      local.get	$push[[L0:[0-9]+]]=, 0
-; CHECK-NEXT: local.get	$push[[L1:[0-9]+]]=, 1
-; CHECK-NEXT: f64.gt  	$push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; CHECK-NEXT: i32.eqz   $push[[NUM3:[0-9]+]]=, $pop[[NUM0]]
-; CHECK-NEXT: br_if   	0, $pop[[NUM3]]
-; CHECK-NEXT: call	call1
 define void @ogt_f64_branch(double %a, double %b) {
+; CHECK-LABEL: ogt_f64_branch:
+; CHECK:         .functype ogt_f64_branch (f64, f64) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.gt $push0=, $pop2, $pop1
+; CHECK-NEXT:    i32.eqz $push3=, $pop0
+; CHECK-NEXT:    br_if 0, $pop3 # 0: down to label3
+; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    call call1
+; CHECK-NEXT:  .LBB17_2: # %if.end
+; CHECK-NEXT:    end_block # label3:
+; CHECK-NEXT:    return
 entry:
   %cmp = fcmp ogt double %a, %b
   br i1 %cmp, label %if.then, label %if.end
@@ -266,14 +324,21 @@ if.end:
   ret void
 }
 
-; CHECK-LABEL: ult_f64_branch:
-; CHECK:      local.get	$push[[L0:[0-9]+]]=, 0
-; CHECK-NEXT: local.get	$push[[L1:[0-9]+]]=, 1
-; CHECK-NEXT: f64.ge  	$push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; CHECK-NEXT: i32.eqz   $push[[NUM3:[0-9]+]]=, $pop[[NUM0]]
-; CHECK-NEXT: br_if   	0, $pop[[NUM3]]
-; CHECK-NEXT: call	call1
 define void @ult_f64_branch(double %a, double %b) {
+; CHECK-LABEL: ult_f64_branch:
+; CHECK:         .functype ult_f64_branch (f64, f64) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.ge $push0=, $pop2, $pop1
+; CHECK-NEXT:    i32.eqz $push3=, $pop0
+; CHECK-NEXT:    br_if 0, $pop3 # 0: down to label4
+; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    call call1
+; CHECK-NEXT:  .LBB18_2: # %if.end
+; CHECK-NEXT:    end_block # label4:
+; CHECK-NEXT:    return
 entry:
   %cmp = fcmp ult double %a, %b
   br i1 %cmp, label %if.end, label %if.then
@@ -286,14 +351,21 @@ if.end:
   ret void
 }
 
-; CHECK-LABEL: ule_f64_branch:
-; CHECK:      local.get	$push[[L0:[0-9]+]]=, 0
-; CHECK-NEXT: local.get	$push[[L1:[0-9]+]]=, 1
-; CHECK-NEXT: f64.gt  	$push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; CHECK-NEXT: i32.eqz   $push[[NUM3:[0-9]+]]=, $pop[[NUM0]]
-; CHECK-NEXT: br_if   	0, $pop[[NUM3]]
-; CHECK-NEXT: call	call1
 define void @ule_f64_branch(double %a, double %b) {
+; CHECK-LABEL: ule_f64_branch:
+; CHECK:         .functype ule_f64_branch (f64, f64) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.gt $push0=, $pop2, $pop1
+; CHECK-NEXT:    i32.eqz $push3=, $pop0
+; CHECK-NEXT:    br_if 0, $pop3 # 0: down to label5
+; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    call call1
+; CHECK-NEXT:  .LBB19_2: # %if.end
+; CHECK-NEXT:    end_block # label5:
+; CHECK-NEXT:    return
 entry:
   %cmp = fcmp ule double %a, %b
   br i1 %cmp, label %if.end, label %if.then
@@ -306,16 +378,31 @@ if.end:
   ret void
 }
 
-; CHECK-LABEL: xor_zext_switch
-; CHECK:      i32.const	$push[[L1:[0-9]+]]=, 0
-; CHECK-NEXT: br_if   	0, $pop[[L1]]
-; CHECK-NEXT: block
-; CHECK-NEXT: block
-; CHECK-NEXT: local.get	$push[[L3:[0-9]+]]=, 0
-; CHECK-NEXT: local.get	$push[[L2:[0-9]+]]=, 1
-; CHECK-NEXT: f64.ge  	$push[[L0:[0-9]+]]=, $pop[[L3]], $pop[[L2]]
-; CHECK-NEXT: br_table 	$pop[[L0]], 0, 1, 0
 define void @xor_zext_switch(double %a, double %b) {
+; CHECK-LABEL: xor_zext_switch:
+; CHECK:         .functype xor_zext_switch (f64, f64) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    i32.const $push1=, 0
+; CHECK-NEXT:    br_if 0, $pop1 # 0: down to label6
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push3=, 0
+; CHECK-NEXT:    local.get $push2=, 1
+; CHECK-NEXT:    f64.ge $push0=, $pop3, $pop2
+; CHECK-NEXT:    br_table $pop0, 0, 1, 0 # 0: down to label8
+; CHECK-NEXT:    # 1: down to label7
+; CHECK-NEXT:  .LBB20_2: # %sw.bb.1
+; CHECK-NEXT:    end_block # label8:
+; CHECK-NEXT:    call foo1
+; CHECK-NEXT:    return
+; CHECK-NEXT:  .LBB20_3: # %sw.bb.2
+; CHECK-NEXT:    end_block # label7:
+; CHECK-NEXT:    call foo2
+; CHECK-NEXT:  .LBB20_4: # %exit
+; CHECK-NEXT:    end_block # label6:
+; CHECK-NEXT:    return
 entry:
   %cmp = fcmp ult double %a, %b
   %zext = zext i1 %cmp to i32
@@ -337,18 +424,41 @@ exit:
   ret void
 }
 
-; CHECK-LABEL: xor_add_switch
-; CHECK:      local.get	$push[[L8:[0-9]+]]=, 0
-; CHECK-NEXT: local.get	$push[[L7:[0-9]+]]=, 1
-; CHECK-NEXT: f64.ge  	$push[[L1:[0-9]+]]=, $pop[[L8]], $pop[[L7]]
-; CHECK-NEXT: i32.const	$push[[L2:[0-9]+]]=, 1
-; CHECK-NEXT: i32.xor 	$push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]
-; CHECK-NEXT: i32.const	$push[[L6:[0-9]+]]=, 1
-; CHECK-NEXT: i32.add 	$push[[L4:[0-9]+]]=, $pop[[L3]], $pop[[L6]]
-; CHECK-NEXT: i32.const	$push[[L5:[0-9]+]]=, 1
-; CHECK-NEXT: i32.xor 	$push[[L0:[0-9]+]]=, $pop[[L4]], $pop[[L5]]
-; CHECK-NEXT: br_table 	$pop[[L0]], 0, 1, 2, 3
 define void @xor_add_switch(double %a, double %b) {
+; CHECK-LABEL: xor_add_switch:
+; CHECK:         .functype xor_add_switch (f64, f64) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    block
+; CHECK-NEXT:    block
+; CHECK-NEXT:    block
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push8=, 0
+; CHECK-NEXT:    local.get $push7=, 1
+; CHECK-NEXT:    f64.ge $push1=, $pop8, $pop7
+; CHECK-NEXT:    i32.const $push2=, 1
+; CHECK-NEXT:    i32.xor $push3=, $pop1, $pop2
+; CHECK-NEXT:    i32.const $push6=, 1
+; CHECK-NEXT:    i32.add $push4=, $pop3, $pop6
+; CHECK-NEXT:    i32.const $push5=, 1
+; CHECK-NEXT:    i32.xor $push0=, $pop4, $pop5
+; CHECK-NEXT:    br_table $pop0, 0, 1, 2, 3 # 0: down to label12
+; CHECK-NEXT:    # 1: down to label11
+; CHECK-NEXT:    # 2: down to label10
+; CHECK-NEXT:    # 3: down to label9
+; CHECK-NEXT:  .LBB21_1: # %sw.bb.1
+; CHECK-NEXT:    end_block # label12:
+; CHECK-NEXT:    call foo1
+; CHECK-NEXT:    return
+; CHECK-NEXT:  .LBB21_2: # %sw.bb.2
+; CHECK-NEXT:    end_block # label11:
+; CHECK-NEXT:    call foo2
+; CHECK-NEXT:    return
+; CHECK-NEXT:  .LBB21_3: # %sw.bb.3
+; CHECK-NEXT:    end_block # label10:
+; CHECK-NEXT:    call foo3
+; CHECK-NEXT:  .LBB21_4: # %exit
+; CHECK-NEXT:    end_block # label9:
+; CHECK-NEXT:    return
 entry:
   %cmp = fcmp ult double %a, %b
   %zext = zext i1 %cmp to i32

diff  --git a/llvm/test/CodeGen/WebAssembly/f32.ll b/llvm/test/CodeGen/WebAssembly/f32.ll
index 1a75245b6f183..6a3f31ff27869 100644
--- a/llvm/test/CodeGen/WebAssembly/f32.ll
+++ b/llvm/test/CodeGen/WebAssembly/f32.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
 
 ; Test that basic 32-bit floating-point operations assemble as expected.
 
@@ -14,171 +15,241 @@ declare float @llvm.nearbyint.f32(float)
 declare float @llvm.rint.f32(float)
 declare float @llvm.fma.f32(float, float, float)
 
-; CHECK-LABEL: fadd32:
-; CHECK-NEXT: .functype fadd32 (f32, f32) -> (f32){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f32.add $push[[LR:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @fadd32(float %x, float %y) {
+; CHECK-LABEL: fadd32:
+; CHECK:         .functype fadd32 (f32, f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.add $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fadd float %x, %y
   ret float %a
 }
 
-; CHECK-LABEL: fsub32:
-; CHECK: f32.sub $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @fsub32(float %x, float %y) {
+; CHECK-LABEL: fsub32:
+; CHECK:         .functype fsub32 (f32, f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.sub $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fsub float %x, %y
   ret float %a
 }
 
-; CHECK-LABEL: fmul32:
-; CHECK: f32.mul $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @fmul32(float %x, float %y) {
+; CHECK-LABEL: fmul32:
+; CHECK:         .functype fmul32 (f32, f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.mul $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fmul float %x, %y
   ret float %a
 }
 
-; CHECK-LABEL: fdiv32:
-; CHECK: f32.div $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @fdiv32(float %x, float %y) {
+; CHECK-LABEL: fdiv32:
+; CHECK:         .functype fdiv32 (f32, f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.div $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fdiv float %x, %y
   ret float %a
 }
 
-; CHECK-LABEL: fabs32:
-; CHECK: f32.abs $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @fabs32(float %x) {
+; CHECK-LABEL: fabs32:
+; CHECK:         .functype fabs32 (f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push1=, 0
+; CHECK-NEXT:    f32.abs $push0=, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call float @llvm.fabs.f32(float %x)
   ret float %a
 }
 
-; CHECK-LABEL: fneg32:
-; CHECK: f32.neg $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @fneg32(float %x) {
+; CHECK-LABEL: fneg32:
+; CHECK:         .functype fneg32 (f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push1=, 0
+; CHECK-NEXT:    f32.neg $push0=, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fsub float -0., %x
   ret float %a
 }
 
-; CHECK-LABEL: copysign32:
-; CHECK: f32.copysign $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @copysign32(float %x, float %y) {
+; CHECK-LABEL: copysign32:
+; CHECK:         .functype copysign32 (f32, f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.copysign $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call float @llvm.copysign.f32(float %x, float %y)
   ret float %a
 }
 
-; CHECK-LABEL: sqrt32:
-; CHECK: f32.sqrt $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @sqrt32(float %x) {
+; CHECK-LABEL: sqrt32:
+; CHECK:         .functype sqrt32 (f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push1=, 0
+; CHECK-NEXT:    f32.sqrt $push0=, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call float @llvm.sqrt.f32(float %x)
   ret float %a
 }
 
-; CHECK-LABEL: ceil32:
-; CHECK: f32.ceil $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @ceil32(float %x) {
+; CHECK-LABEL: ceil32:
+; CHECK:         .functype ceil32 (f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push1=, 0
+; CHECK-NEXT:    f32.ceil $push0=, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call float @llvm.ceil.f32(float %x)
   ret float %a
 }
 
-; CHECK-LABEL: floor32:
-; CHECK: f32.floor $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @floor32(float %x) {
+; CHECK-LABEL: floor32:
+; CHECK:         .functype floor32 (f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push1=, 0
+; CHECK-NEXT:    f32.floor $push0=, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call float @llvm.floor.f32(float %x)
   ret float %a
 }
 
-; CHECK-LABEL: trunc32:
-; CHECK: f32.trunc $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @trunc32(float %x) {
+; CHECK-LABEL: trunc32:
+; CHECK:         .functype trunc32 (f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push1=, 0
+; CHECK-NEXT:    f32.trunc $push0=, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call float @llvm.trunc.f32(float %x)
   ret float %a
 }
 
-; CHECK-LABEL: nearest32:
-; CHECK: f32.nearest $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @nearest32(float %x) {
+; CHECK-LABEL: nearest32:
+; CHECK:         .functype nearest32 (f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push1=, 0
+; CHECK-NEXT:    f32.nearest $push0=, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call float @llvm.nearbyint.f32(float %x)
   ret float %a
 }
 
-; CHECK-LABEL: nearest32_via_rint:
-; CHECK: f32.nearest $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @nearest32_via_rint(float %x) {
+; CHECK-LABEL: nearest32_via_rint:
+; CHECK:         .functype nearest32_via_rint (f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push1=, 0
+; CHECK-NEXT:    f32.nearest $push0=, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call float @llvm.rint.f32(float %x)
   ret float %a
 }
 
-; CHECK-LABEL: fmin32:
-; CHECK: f32.min $push1=, $pop{{[0-9]+}}, $pop[[LR]]{{$}}
-; CHECK-NEXT: return $pop1{{$}}
 define float @fmin32(float %x) {
+; CHECK-LABEL: fmin32:
+; CHECK:         .functype fmin32 (f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    f32.const $push0=, 0x0p0
+; CHECK-NEXT:    f32.min $push1=, $pop2, $pop0
+; CHECK-NEXT:    return $pop1
   %a = fcmp ult float %x, 0.0
   %b = select i1 %a, float %x, float 0.0
   ret float %b
 }
 
-; CHECK-LABEL: fmax32:
-; CHECK: f32.max $push1=, $pop{{[0-9]+}}, $pop[[LR]]{{$}}
-; CHECK-NEXT: return $pop1{{$}}
 define float @fmax32(float %x) {
+; CHECK-LABEL: fmax32:
+; CHECK:         .functype fmax32 (f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    f32.const $push0=, 0x0p0
+; CHECK-NEXT:    f32.max $push1=, $pop2, $pop0
+; CHECK-NEXT:    return $pop1
   %a = fcmp ugt float %x, 0.0
   %b = select i1 %a, float %x, float 0.0
   ret float %b
 }
 
-; CHECK-LABEL: fmin32_intrinsic:
-; CHECK: f32.min $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop0{{$}}
 declare float @llvm.minimum.f32(float, float)
 define float @fmin32_intrinsic(float %x, float %y) {
+; CHECK-LABEL: fmin32_intrinsic:
+; CHECK:         .functype fmin32_intrinsic (f32, f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.min $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call float @llvm.minimum.f32(float %x, float %y)
   ret float %a
 }
 
-; CHECK-LABEL: fminnum32_intrinsic:
-; CHECK: f32.min $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop0{{$}}
 declare float @llvm.minnum.f32(float, float)
 define float @fminnum32_intrinsic(float %x, float %y) {
+; CHECK-LABEL: fminnum32_intrinsic:
+; CHECK:         .functype fminnum32_intrinsic (f32, f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.min $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call nnan float @llvm.minnum.f32(float %x, float %y)
   ret float %a
 }
 
-; CHECK-LABEL: fmax32_intrinsic:
-; CHECK: f32.max $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop0{{$}}
 declare float @llvm.maximum.f32(float, float)
 define float @fmax32_intrinsic(float %x, float %y) {
+; CHECK-LABEL: fmax32_intrinsic:
+; CHECK:         .functype fmax32_intrinsic (f32, f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.max $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call float @llvm.maximum.f32(float %x, float %y)
   ret float %a
 }
 
-; CHECK-LABEL: fmaxnum32_intrinsic:
-; CHECK: f32.max $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop0{{$}}
 declare float @llvm.maxnum.f32(float, float)
 define float @fmaxnum32_intrinsic(float %x, float %y) {
+; CHECK-LABEL: fmaxnum32_intrinsic:
+; CHECK:         .functype fmaxnum32_intrinsic (f32, f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f32.max $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call nnan float @llvm.maxnum.f32(float %x, float %y)
   ret float %a
 }
 
-; CHECK-LABEL: fma32:
-; CHECK: {{^}} call $push[[LR:[0-9]+]]=, fmaf, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @fma32(float %a, float %b, float %c) {
+; CHECK-LABEL: fma32:
+; CHECK:         .functype fma32 (f32, f32, f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push3=, 0
+; CHECK-NEXT:    local.get $push2=, 1
+; CHECK-NEXT:    local.get $push1=, 2
+; CHECK-NEXT:    call $push0=, fmaf, $pop3, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %d = call float @llvm.fma.f32(float %a, float %b, float %c)
   ret float %d
 }

diff  --git a/llvm/test/CodeGen/WebAssembly/f64.ll b/llvm/test/CodeGen/WebAssembly/f64.ll
index 138044f06ac09..ab31b5dfc73f6 100644
--- a/llvm/test/CodeGen/WebAssembly/f64.ll
+++ b/llvm/test/CodeGen/WebAssembly/f64.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
 
 ; Test that basic 64-bit floating-point operations assemble as expected.
 
@@ -14,153 +15,215 @@ declare double @llvm.nearbyint.f64(double)
 declare double @llvm.rint.f64(double)
 declare double @llvm.fma.f64(double, double, double)
 
-; CHECK-LABEL: fadd64:
-; CHECK-NEXT: .functype fadd64 (f64, f64) -> (f64){{$}}
-; CHECK-NEXT: local.get $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: local.get $push[[L1:[0-9]+]]=, 1{{$}}
-; CHECK-NEXT: f64.add $push[[LR:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @fadd64(double %x, double %y) {
+; CHECK-LABEL: fadd64:
+; CHECK:         .functype fadd64 (f64, f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.add $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fadd double %x, %y
   ret double %a
 }
 
-; CHECK-LABEL: fsub64:
-; CHECK: f64.sub $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @fsub64(double %x, double %y) {
+; CHECK-LABEL: fsub64:
+; CHECK:         .functype fsub64 (f64, f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.sub $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fsub double %x, %y
   ret double %a
 }
 
-; CHECK-LABEL: fmul64:
-; CHECK: f64.mul $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @fmul64(double %x, double %y) {
+; CHECK-LABEL: fmul64:
+; CHECK:         .functype fmul64 (f64, f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.mul $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fmul double %x, %y
   ret double %a
 }
 
-; CHECK-LABEL: fdiv64:
-; CHECK: f64.div $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @fdiv64(double %x, double %y) {
+; CHECK-LABEL: fdiv64:
+; CHECK:         .functype fdiv64 (f64, f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.div $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fdiv double %x, %y
   ret double %a
 }
 
-; CHECK-LABEL: fabs64:
-; CHECK: f64.abs $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @fabs64(double %x) {
+; CHECK-LABEL: fabs64:
+; CHECK:         .functype fabs64 (f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push1=, 0
+; CHECK-NEXT:    f64.abs $push0=, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call double @llvm.fabs.f64(double %x)
   ret double %a
 }
 
-; CHECK-LABEL: fneg64:
-; CHECK: f64.neg $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @fneg64(double %x) {
+; CHECK-LABEL: fneg64:
+; CHECK:         .functype fneg64 (f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push1=, 0
+; CHECK-NEXT:    f64.neg $push0=, $pop1
+; CHECK-NEXT:    return $pop0
   %a = fsub double -0., %x
   ret double %a
 }
 
-; CHECK-LABEL: copysign64:
-; CHECK: f64.copysign $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @copysign64(double %x, double %y) {
+; CHECK-LABEL: copysign64:
+; CHECK:         .functype copysign64 (f64, f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.copysign $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call double @llvm.copysign.f64(double %x, double %y)
   ret double %a
 }
 
-; CHECK-LABEL: sqrt64:
-; CHECK: f64.sqrt $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @sqrt64(double %x) {
+; CHECK-LABEL: sqrt64:
+; CHECK:         .functype sqrt64 (f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push1=, 0
+; CHECK-NEXT:    f64.sqrt $push0=, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call double @llvm.sqrt.f64(double %x)
   ret double %a
 }
 
-; CHECK-LABEL: ceil64:
-; CHECK: f64.ceil $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @ceil64(double %x) {
+; CHECK-LABEL: ceil64:
+; CHECK:         .functype ceil64 (f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push1=, 0
+; CHECK-NEXT:    f64.ceil $push0=, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call double @llvm.ceil.f64(double %x)
   ret double %a
 }
 
-; CHECK-LABEL: floor64:
-; CHECK: f64.floor $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @floor64(double %x) {
+; CHECK-LABEL: floor64:
+; CHECK:         .functype floor64 (f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push1=, 0
+; CHECK-NEXT:    f64.floor $push0=, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call double @llvm.floor.f64(double %x)
   ret double %a
 }
 
-; CHECK-LABEL: trunc64:
-; CHECK: f64.trunc $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @trunc64(double %x) {
+; CHECK-LABEL: trunc64:
+; CHECK:         .functype trunc64 (f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push1=, 0
+; CHECK-NEXT:    f64.trunc $push0=, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call double @llvm.trunc.f64(double %x)
   ret double %a
 }
 
-; CHECK-LABEL: nearest64:
-; CHECK: f64.nearest $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @nearest64(double %x) {
+; CHECK-LABEL: nearest64:
+; CHECK:         .functype nearest64 (f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push1=, 0
+; CHECK-NEXT:    f64.nearest $push0=, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call double @llvm.nearbyint.f64(double %x)
   ret double %a
 }
 
-; CHECK-LABEL: nearest64_via_rint:
-; CHECK: f64.nearest $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @nearest64_via_rint(double %x) {
+; CHECK-LABEL: nearest64_via_rint:
+; CHECK:         .functype nearest64_via_rint (f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push1=, 0
+; CHECK-NEXT:    f64.nearest $push0=, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call double @llvm.rint.f64(double %x)
   ret double %a
 }
 
-; CHECK-LABEL: fmin64:
-; CHECK: f64.min $push1=, $pop{{[0-9]+}}, $pop[[LR]]{{$}}
-; CHECK-NEXT: return $pop1{{$}}
 define double @fmin64(double %x) {
+; CHECK-LABEL: fmin64:
+; CHECK:         .functype fmin64 (f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    f64.const $push0=, 0x0p0
+; CHECK-NEXT:    f64.min $push1=, $pop2, $pop0
+; CHECK-NEXT:    return $pop1
   %a = fcmp ult double %x, 0.0
   %b = select i1 %a, double %x, double 0.0
   ret double %b
 }
 
-; CHECK-LABEL: fmax64:
-; CHECK: f64.max $push1=, $pop{{[0-9]+}}, $pop[[LR]]{{$}}
-; CHECK-NEXT: return $pop1{{$}}
 define double @fmax64(double %x) {
+; CHECK-LABEL: fmax64:
+; CHECK:         .functype fmax64 (f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    f64.const $push0=, 0x0p0
+; CHECK-NEXT:    f64.max $push1=, $pop2, $pop0
+; CHECK-NEXT:    return $pop1
   %a = fcmp ugt double %x, 0.0
   %b = select i1 %a, double %x, double 0.0
   ret double %b
 }
 
-; CHECK-LABEL: fmin64_intrinsic:
-; CHECK: f64.min $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop0{{$}}
 declare double @llvm.minimum.f64(double, double)
 define double @fmin64_intrinsic(double %x, double %y) {
+; CHECK-LABEL: fmin64_intrinsic:
+; CHECK:         .functype fmin64_intrinsic (f64, f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.min $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call double @llvm.minimum.f64(double %x, double %y)
   ret double %a
 }
 
-; CHECK-LABEL: fmax64_intrinsic:
-; CHECK: f64.max $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop0{{$}}
 declare double @llvm.maximum.f64(double, double)
 define double @fmax64_intrinsic(double %x, double %y) {
+; CHECK-LABEL: fmax64_intrinsic:
+; CHECK:         .functype fmax64_intrinsic (f64, f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push2=, 0
+; CHECK-NEXT:    local.get $push1=, 1
+; CHECK-NEXT:    f64.max $push0=, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %a = call double @llvm.maximum.f64(double %x, double %y)
   ret double %a
 }
 
-; CHECK-LABEL: fma64:
-; CHECK: {{^}} call $push[[LR:[0-9]+]]=, fma, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @fma64(double %a, double %b, double %c) {
+; CHECK-LABEL: fma64:
+; CHECK:         .functype fma64 (f64, f64, f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push3=, 0
+; CHECK-NEXT:    local.get $push2=, 1
+; CHECK-NEXT:    local.get $push1=, 2
+; CHECK-NEXT:    call $push0=, fma, $pop3, $pop2, $pop1
+; CHECK-NEXT:    return $pop0
   %d = call double @llvm.fma.f64(double %a, double %b, double %c)
   ret double %d
 }

diff  --git a/llvm/test/CodeGen/WebAssembly/i128.ll b/llvm/test/CodeGen/WebAssembly/i128.ll
index 6be1457b149d8..50d4680fd613d 100644
--- a/llvm/test/CodeGen/WebAssembly/i128.ll
+++ b/llvm/test/CodeGen/WebAssembly/i128.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
 
 ; Test that basic 128-bit integer operations assemble as expected.
 
@@ -8,208 +9,572 @@ declare i128 @llvm.ctlz.i128(i128, i1)
 declare i128 @llvm.cttz.i128(i128, i1)
 declare i128 @llvm.ctpop.i128(i128)
 
-; CHECK-LABEL: add128:
-; CHECK-NEXT: .functype add128 (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK-NOT:  .result
-; CHECK:      i64.add
-; CHECK:      i64.store
-; CHECK:      i64.add
-; CHECK:      i64.store
-; CHECK-NEXT: return{{$}}
 define i128 @add128(i128 %x, i128 %y) {
+; CHECK-LABEL: add128:
+; CHECK:         .functype add128 (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push8=, 0
+; CHECK-NEXT:    local.get $push7=, 1
+; CHECK-NEXT:    local.get $push6=, 3
+; CHECK-NEXT:    i64.add $push5=, $pop7, $pop6
+; CHECK-NEXT:    local.tee $push4=, 3, $pop5
+; CHECK-NEXT:    i64.store 0($pop8), $pop4
+; CHECK-NEXT:    local.get $push13=, 0
+; CHECK-NEXT:    local.get $push10=, 2
+; CHECK-NEXT:    local.get $push9=, 4
+; CHECK-NEXT:    i64.add $push0=, $pop10, $pop9
+; CHECK-NEXT:    local.get $push12=, 3
+; CHECK-NEXT:    local.get $push11=, 1
+; CHECK-NEXT:    i64.lt_u $push1=, $pop12, $pop11
+; CHECK-NEXT:    i64.extend_i32_u $push2=, $pop1
+; CHECK-NEXT:    i64.add $push3=, $pop0, $pop2
+; CHECK-NEXT:    i64.store 8($pop13), $pop3
+; CHECK-NEXT:    return
   %a = add i128 %x, %y
   ret i128 %a
 }
 
-; CHECK-LABEL: sub128:
-; CHECK-NEXT: .functype sub128 (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK:      i64.sub
-; CHECK:      i64.store
-; CHECK:      i64.sub
-; CHECK:      i64.store
-; CHECK-NEXT: return{{$}}
 define i128 @sub128(i128 %x, i128 %y) {
+; CHECK-LABEL: sub128:
+; CHECK:         .functype sub128 (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push7=, 0
+; CHECK-NEXT:    local.get $push6=, 1
+; CHECK-NEXT:    local.get $push5=, 3
+; CHECK-NEXT:    i64.sub $push0=, $pop6, $pop5
+; CHECK-NEXT:    i64.store 0($pop7), $pop0
+; CHECK-NEXT:    local.get $push12=, 0
+; CHECK-NEXT:    local.get $push9=, 2
+; CHECK-NEXT:    local.get $push8=, 4
+; CHECK-NEXT:    i64.sub $push1=, $pop9, $pop8
+; CHECK-NEXT:    local.get $push11=, 1
+; CHECK-NEXT:    local.get $push10=, 3
+; CHECK-NEXT:    i64.lt_u $push2=, $pop11, $pop10
+; CHECK-NEXT:    i64.extend_i32_u $push3=, $pop2
+; CHECK-NEXT:    i64.sub $push4=, $pop1, $pop3
+; CHECK-NEXT:    i64.store 8($pop12), $pop4
+; CHECK-NEXT:    return
   %a = sub i128 %x, %y
   ret i128 %a
 }
 
-; CHECK-LABEL: mul128:
-; CHECK-NEXT: .functype mul128 (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK: call __multi3, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
-; CHECK: return{{$}}
 define i128 @mul128(i128 %x, i128 %y) {
+; CHECK-LABEL: mul128:
+; CHECK:         .functype mul128 (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push4=, __stack_pointer
+; CHECK-NEXT:    i32.const $push5=, 16
+; CHECK-NEXT:    i32.sub $push9=, $pop4, $pop5
+; CHECK-NEXT:    local.tee $push8=, 5, $pop9
+; CHECK-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-NEXT:    local.get $push14=, 5
+; CHECK-NEXT:    local.get $push13=, 1
+; CHECK-NEXT:    local.get $push12=, 2
+; CHECK-NEXT:    local.get $push11=, 3
+; CHECK-NEXT:    local.get $push10=, 4
+; CHECK-NEXT:    call __multi3, $pop14, $pop13, $pop12, $pop11, $pop10
+; CHECK-NEXT:    local.get $push16=, 0
+; CHECK-NEXT:    local.get $push15=, 5
+; CHECK-NEXT:    i32.const $push0=, 8
+; CHECK-NEXT:    i32.add $push1=, $pop15, $pop0
+; CHECK-NEXT:    i64.load $push2=, 0($pop1)
+; CHECK-NEXT:    i64.store 8($pop16), $pop2
+; CHECK-NEXT:    local.get $push18=, 0
+; CHECK-NEXT:    local.get $push17=, 5
+; CHECK-NEXT:    i64.load $push3=, 0($pop17)
+; CHECK-NEXT:    i64.store 0($pop18), $pop3
+; CHECK-NEXT:    local.get $push19=, 5
+; CHECK-NEXT:    i32.const $push6=, 16
+; CHECK-NEXT:    i32.add $push7=, $pop19, $pop6
+; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    return
   %a = mul i128 %x, %y
   ret i128 %a
 }
 
-; CHECK-LABEL: sdiv128:
-; CHECK-NEXT: .functype sdiv128 (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK: call __divti3, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
-; CHECK: return{{$}}
 define i128 @sdiv128(i128 %x, i128 %y) {
+; CHECK-LABEL: sdiv128:
+; CHECK:         .functype sdiv128 (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push4=, __stack_pointer
+; CHECK-NEXT:    i32.const $push5=, 16
+; CHECK-NEXT:    i32.sub $push9=, $pop4, $pop5
+; CHECK-NEXT:    local.tee $push8=, 5, $pop9
+; CHECK-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-NEXT:    local.get $push14=, 5
+; CHECK-NEXT:    local.get $push13=, 1
+; CHECK-NEXT:    local.get $push12=, 2
+; CHECK-NEXT:    local.get $push11=, 3
+; CHECK-NEXT:    local.get $push10=, 4
+; CHECK-NEXT:    call __divti3, $pop14, $pop13, $pop12, $pop11, $pop10
+; CHECK-NEXT:    local.get $push16=, 0
+; CHECK-NEXT:    local.get $push15=, 5
+; CHECK-NEXT:    i32.const $push0=, 8
+; CHECK-NEXT:    i32.add $push1=, $pop15, $pop0
+; CHECK-NEXT:    i64.load $push2=, 0($pop1)
+; CHECK-NEXT:    i64.store 8($pop16), $pop2
+; CHECK-NEXT:    local.get $push18=, 0
+; CHECK-NEXT:    local.get $push17=, 5
+; CHECK-NEXT:    i64.load $push3=, 0($pop17)
+; CHECK-NEXT:    i64.store 0($pop18), $pop3
+; CHECK-NEXT:    local.get $push19=, 5
+; CHECK-NEXT:    i32.const $push6=, 16
+; CHECK-NEXT:    i32.add $push7=, $pop19, $pop6
+; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    return
   %a = sdiv i128 %x, %y
   ret i128 %a
 }
 
-; CHECK-LABEL: udiv128:
-; CHECK-NEXT: .functype udiv128 (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK: call __udivti3, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
-; CHECK: return{{$}}
 define i128 @udiv128(i128 %x, i128 %y) {
+; CHECK-LABEL: udiv128:
+; CHECK:         .functype udiv128 (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push4=, __stack_pointer
+; CHECK-NEXT:    i32.const $push5=, 16
+; CHECK-NEXT:    i32.sub $push9=, $pop4, $pop5
+; CHECK-NEXT:    local.tee $push8=, 5, $pop9
+; CHECK-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-NEXT:    local.get $push14=, 5
+; CHECK-NEXT:    local.get $push13=, 1
+; CHECK-NEXT:    local.get $push12=, 2
+; CHECK-NEXT:    local.get $push11=, 3
+; CHECK-NEXT:    local.get $push10=, 4
+; CHECK-NEXT:    call __udivti3, $pop14, $pop13, $pop12, $pop11, $pop10
+; CHECK-NEXT:    local.get $push16=, 0
+; CHECK-NEXT:    local.get $push15=, 5
+; CHECK-NEXT:    i32.const $push0=, 8
+; CHECK-NEXT:    i32.add $push1=, $pop15, $pop0
+; CHECK-NEXT:    i64.load $push2=, 0($pop1)
+; CHECK-NEXT:    i64.store 8($pop16), $pop2
+; CHECK-NEXT:    local.get $push18=, 0
+; CHECK-NEXT:    local.get $push17=, 5
+; CHECK-NEXT:    i64.load $push3=, 0($pop17)
+; CHECK-NEXT:    i64.store 0($pop18), $pop3
+; CHECK-NEXT:    local.get $push19=, 5
+; CHECK-NEXT:    i32.const $push6=, 16
+; CHECK-NEXT:    i32.add $push7=, $pop19, $pop6
+; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    return
   %a = udiv i128 %x, %y
   ret i128 %a
 }
 
-; CHECK-LABEL: srem128:
-; CHECK-NEXT: .functype srem128 (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK: call __modti3, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
-; CHECK: return{{$}}
 define i128 @srem128(i128 %x, i128 %y) {
+; CHECK-LABEL: srem128:
+; CHECK:         .functype srem128 (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push4=, __stack_pointer
+; CHECK-NEXT:    i32.const $push5=, 16
+; CHECK-NEXT:    i32.sub $push9=, $pop4, $pop5
+; CHECK-NEXT:    local.tee $push8=, 5, $pop9
+; CHECK-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-NEXT:    local.get $push14=, 5
+; CHECK-NEXT:    local.get $push13=, 1
+; CHECK-NEXT:    local.get $push12=, 2
+; CHECK-NEXT:    local.get $push11=, 3
+; CHECK-NEXT:    local.get $push10=, 4
+; CHECK-NEXT:    call __modti3, $pop14, $pop13, $pop12, $pop11, $pop10
+; CHECK-NEXT:    local.get $push16=, 0
+; CHECK-NEXT:    local.get $push15=, 5
+; CHECK-NEXT:    i32.const $push0=, 8
+; CHECK-NEXT:    i32.add $push1=, $pop15, $pop0
+; CHECK-NEXT:    i64.load $push2=, 0($pop1)
+; CHECK-NEXT:    i64.store 8($pop16), $pop2
+; CHECK-NEXT:    local.get $push18=, 0
+; CHECK-NEXT:    local.get $push17=, 5
+; CHECK-NEXT:    i64.load $push3=, 0($pop17)
+; CHECK-NEXT:    i64.store 0($pop18), $pop3
+; CHECK-NEXT:    local.get $push19=, 5
+; CHECK-NEXT:    i32.const $push6=, 16
+; CHECK-NEXT:    i32.add $push7=, $pop19, $pop6
+; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    return
   %a = srem i128 %x, %y
   ret i128 %a
 }
 
-; CHECK-LABEL: urem128:
-; CHECK-NEXT: .functype urem128 (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK: call __umodti3, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
-; CHECK: return{{$}}
 define i128 @urem128(i128 %x, i128 %y) {
+; CHECK-LABEL: urem128:
+; CHECK:         .functype urem128 (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push4=, __stack_pointer
+; CHECK-NEXT:    i32.const $push5=, 16
+; CHECK-NEXT:    i32.sub $push9=, $pop4, $pop5
+; CHECK-NEXT:    local.tee $push8=, 5, $pop9
+; CHECK-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-NEXT:    local.get $push14=, 5
+; CHECK-NEXT:    local.get $push13=, 1
+; CHECK-NEXT:    local.get $push12=, 2
+; CHECK-NEXT:    local.get $push11=, 3
+; CHECK-NEXT:    local.get $push10=, 4
+; CHECK-NEXT:    call __umodti3, $pop14, $pop13, $pop12, $pop11, $pop10
+; CHECK-NEXT:    local.get $push16=, 0
+; CHECK-NEXT:    local.get $push15=, 5
+; CHECK-NEXT:    i32.const $push0=, 8
+; CHECK-NEXT:    i32.add $push1=, $pop15, $pop0
+; CHECK-NEXT:    i64.load $push2=, 0($pop1)
+; CHECK-NEXT:    i64.store 8($pop16), $pop2
+; CHECK-NEXT:    local.get $push18=, 0
+; CHECK-NEXT:    local.get $push17=, 5
+; CHECK-NEXT:    i64.load $push3=, 0($pop17)
+; CHECK-NEXT:    i64.store 0($pop18), $pop3
+; CHECK-NEXT:    local.get $push19=, 5
+; CHECK-NEXT:    i32.const $push6=, 16
+; CHECK-NEXT:    i32.add $push7=, $pop19, $pop6
+; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    return
   %a = urem i128 %x, %y
   ret i128 %a
 }
 
-; CHECK-LABEL: and128:
-; CHECK-NEXT: .functype and128 (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK-NOT:  .result
-; CHECK:      i64.and
-; CHECK:      i64.store
-; CHECK:      i64.and
-; CHECK:      i64.store
-; CHECK-NEXT: return{{$}}
 define i128 @and128(i128 %x, i128 %y) {
+; CHECK-LABEL: and128:
+; CHECK:         .functype and128 (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 2
+; CHECK-NEXT:    local.get $push2=, 4
+; CHECK-NEXT:    i64.and $push0=, $pop3, $pop2
+; CHECK-NEXT:    i64.store 8($pop4), $pop0
+; CHECK-NEXT:    local.get $push7=, 0
+; CHECK-NEXT:    local.get $push6=, 1
+; CHECK-NEXT:    local.get $push5=, 3
+; CHECK-NEXT:    i64.and $push1=, $pop6, $pop5
+; CHECK-NEXT:    i64.store 0($pop7), $pop1
+; CHECK-NEXT:    return
   %a = and i128 %x, %y
   ret i128 %a
 }
 
-; CHECK-LABEL: or128:
-; CHECK-NEXT: .functype or128 (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK:      i64.or
-; CHECK:      i64.store
-; CHECK:      i64.or
-; CHECK:      i64.store
-; CHECK-NEXT: return{{$}}
 define i128 @or128(i128 %x, i128 %y) {
+; CHECK-LABEL: or128:
+; CHECK:         .functype or128 (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 2
+; CHECK-NEXT:    local.get $push2=, 4
+; CHECK-NEXT:    i64.or $push0=, $pop3, $pop2
+; CHECK-NEXT:    i64.store 8($pop4), $pop0
+; CHECK-NEXT:    local.get $push7=, 0
+; CHECK-NEXT:    local.get $push6=, 1
+; CHECK-NEXT:    local.get $push5=, 3
+; CHECK-NEXT:    i64.or $push1=, $pop6, $pop5
+; CHECK-NEXT:    i64.store 0($pop7), $pop1
+; CHECK-NEXT:    return
   %a = or i128 %x, %y
   ret i128 %a
 }
 
-; CHECK-LABEL: xor128:
-; CHECK-NEXT: .functype xor128 (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK:      i64.xor
-; CHECK:      i64.store
-; CHECK:      i64.xor
-; CHECK:      i64.store
-; CHECK-NEXT: return{{$}}
 define i128 @xor128(i128 %x, i128 %y) {
+; CHECK-LABEL: xor128:
+; CHECK:         .functype xor128 (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 2
+; CHECK-NEXT:    local.get $push2=, 4
+; CHECK-NEXT:    i64.xor $push0=, $pop3, $pop2
+; CHECK-NEXT:    i64.store 8($pop4), $pop0
+; CHECK-NEXT:    local.get $push7=, 0
+; CHECK-NEXT:    local.get $push6=, 1
+; CHECK-NEXT:    local.get $push5=, 3
+; CHECK-NEXT:    i64.xor $push1=, $pop6, $pop5
+; CHECK-NEXT:    i64.store 0($pop7), $pop1
+; CHECK-NEXT:    return
   %a = xor i128 %x, %y
   ret i128 %a
 }
 
-; CHECK-LABEL: shl128:
-; CHECK-NEXT: .functype shl128 (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK: call __ashlti3, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
-; CHECK: return{{$}}
 define i128 @shl128(i128 %x, i128 %y) {
+; CHECK-LABEL: shl128:
+; CHECK:         .functype shl128 (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push5=, __stack_pointer
+; CHECK-NEXT:    i32.const $push6=, 16
+; CHECK-NEXT:    i32.sub $push10=, $pop5, $pop6
+; CHECK-NEXT:    local.tee $push9=, 5, $pop10
+; CHECK-NEXT:    global.set __stack_pointer, $pop9
+; CHECK-NEXT:    local.get $push14=, 5
+; CHECK-NEXT:    local.get $push13=, 1
+; CHECK-NEXT:    local.get $push12=, 2
+; CHECK-NEXT:    local.get $push11=, 3
+; CHECK-NEXT:    i32.wrap_i64 $push0=, $pop11
+; CHECK-NEXT:    call __ashlti3, $pop14, $pop13, $pop12, $pop0
+; CHECK-NEXT:    local.get $push16=, 0
+; CHECK-NEXT:    local.get $push15=, 5
+; CHECK-NEXT:    i32.const $push1=, 8
+; CHECK-NEXT:    i32.add $push2=, $pop15, $pop1
+; CHECK-NEXT:    i64.load $push3=, 0($pop2)
+; CHECK-NEXT:    i64.store 8($pop16), $pop3
+; CHECK-NEXT:    local.get $push18=, 0
+; CHECK-NEXT:    local.get $push17=, 5
+; CHECK-NEXT:    i64.load $push4=, 0($pop17)
+; CHECK-NEXT:    i64.store 0($pop18), $pop4
+; CHECK-NEXT:    local.get $push19=, 5
+; CHECK-NEXT:    i32.const $push7=, 16
+; CHECK-NEXT:    i32.add $push8=, $pop19, $pop7
+; CHECK-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-NEXT:    return
   %a = shl i128 %x, %y
   ret i128 %a
 }
 
-; CHECK-LABEL: shr128:
-; CHECK-NEXT: .functype shr128 (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK: call __lshrti3, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
-; CHECK: return{{$}}
 define i128 @shr128(i128 %x, i128 %y) {
+; CHECK-LABEL: shr128:
+; CHECK:         .functype shr128 (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push5=, __stack_pointer
+; CHECK-NEXT:    i32.const $push6=, 16
+; CHECK-NEXT:    i32.sub $push10=, $pop5, $pop6
+; CHECK-NEXT:    local.tee $push9=, 5, $pop10
+; CHECK-NEXT:    global.set __stack_pointer, $pop9
+; CHECK-NEXT:    local.get $push14=, 5
+; CHECK-NEXT:    local.get $push13=, 1
+; CHECK-NEXT:    local.get $push12=, 2
+; CHECK-NEXT:    local.get $push11=, 3
+; CHECK-NEXT:    i32.wrap_i64 $push0=, $pop11
+; CHECK-NEXT:    call __lshrti3, $pop14, $pop13, $pop12, $pop0
+; CHECK-NEXT:    local.get $push16=, 0
+; CHECK-NEXT:    local.get $push15=, 5
+; CHECK-NEXT:    i32.const $push1=, 8
+; CHECK-NEXT:    i32.add $push2=, $pop15, $pop1
+; CHECK-NEXT:    i64.load $push3=, 0($pop2)
+; CHECK-NEXT:    i64.store 8($pop16), $pop3
+; CHECK-NEXT:    local.get $push18=, 0
+; CHECK-NEXT:    local.get $push17=, 5
+; CHECK-NEXT:    i64.load $push4=, 0($pop17)
+; CHECK-NEXT:    i64.store 0($pop18), $pop4
+; CHECK-NEXT:    local.get $push19=, 5
+; CHECK-NEXT:    i32.const $push7=, 16
+; CHECK-NEXT:    i32.add $push8=, $pop19, $pop7
+; CHECK-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-NEXT:    return
   %a = lshr i128 %x, %y
   ret i128 %a
 }
 
-; CHECK-LABEL: sar128:
-; CHECK-NEXT: .functype sar128 (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK: call __ashrti3, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
-; CHECK: return{{$}}
 define i128 @sar128(i128 %x, i128 %y) {
+; CHECK-LABEL: sar128:
+; CHECK:         .functype sar128 (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push5=, __stack_pointer
+; CHECK-NEXT:    i32.const $push6=, 16
+; CHECK-NEXT:    i32.sub $push10=, $pop5, $pop6
+; CHECK-NEXT:    local.tee $push9=, 5, $pop10
+; CHECK-NEXT:    global.set __stack_pointer, $pop9
+; CHECK-NEXT:    local.get $push14=, 5
+; CHECK-NEXT:    local.get $push13=, 1
+; CHECK-NEXT:    local.get $push12=, 2
+; CHECK-NEXT:    local.get $push11=, 3
+; CHECK-NEXT:    i32.wrap_i64 $push0=, $pop11
+; CHECK-NEXT:    call __ashrti3, $pop14, $pop13, $pop12, $pop0
+; CHECK-NEXT:    local.get $push16=, 0
+; CHECK-NEXT:    local.get $push15=, 5
+; CHECK-NEXT:    i32.const $push1=, 8
+; CHECK-NEXT:    i32.add $push2=, $pop15, $pop1
+; CHECK-NEXT:    i64.load $push3=, 0($pop2)
+; CHECK-NEXT:    i64.store 8($pop16), $pop3
+; CHECK-NEXT:    local.get $push18=, 0
+; CHECK-NEXT:    local.get $push17=, 5
+; CHECK-NEXT:    i64.load $push4=, 0($pop17)
+; CHECK-NEXT:    i64.store 0($pop18), $pop4
+; CHECK-NEXT:    local.get $push19=, 5
+; CHECK-NEXT:    i32.const $push7=, 16
+; CHECK-NEXT:    i32.add $push8=, $pop19, $pop7
+; CHECK-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-NEXT:    return
   %a = ashr i128 %x, %y
   ret i128 %a
 }
 
-; CHECK-LABEL: clz128:
-; CHECK-NEXT: .functype clz128 (i32, i64, i64) -> (){{$}}
-; CHECK-NOT:  .result
-; CHECK:      i64.clz
-; CHECK:      i64.clz
-; CHECK:      return{{$}}
 define i128 @clz128(i128 %x) {
+; CHECK-LABEL: clz128:
+; CHECK:         .functype clz128 (i32, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push8=, 0
+; CHECK-NEXT:    i64.const $push0=, 0
+; CHECK-NEXT:    i64.store 8($pop8), $pop0
+; CHECK-NEXT:    local.get $push12=, 0
+; CHECK-NEXT:    local.get $push9=, 2
+; CHECK-NEXT:    i64.clz $push5=, $pop9
+; CHECK-NEXT:    local.get $push10=, 1
+; CHECK-NEXT:    i64.clz $push2=, $pop10
+; CHECK-NEXT:    i64.const $push3=, 64
+; CHECK-NEXT:    i64.add $push4=, $pop2, $pop3
+; CHECK-NEXT:    local.get $push11=, 2
+; CHECK-NEXT:    i64.const $push7=, 0
+; CHECK-NEXT:    i64.ne $push1=, $pop11, $pop7
+; CHECK-NEXT:    i64.select $push6=, $pop5, $pop4, $pop1
+; CHECK-NEXT:    i64.store 0($pop12), $pop6
+; CHECK-NEXT:    return
   %a = call i128 @llvm.ctlz.i128(i128 %x, i1 false)
   ret i128 %a
 }
 
-; CHECK-LABEL: clz128_zero_undef:
-; CHECK-NEXT: .functype clz128_zero_undef (i32, i64, i64) -> (){{$}}
-; CHECK:      i64.clz
-; CHECK:      i64.clz
-; CHECK:      return{{$}}
 define i128 @clz128_zero_undef(i128 %x) {
+; CHECK-LABEL: clz128_zero_undef:
+; CHECK:         .functype clz128_zero_undef (i32, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push8=, 0
+; CHECK-NEXT:    i64.const $push0=, 0
+; CHECK-NEXT:    i64.store 8($pop8), $pop0
+; CHECK-NEXT:    local.get $push12=, 0
+; CHECK-NEXT:    local.get $push9=, 2
+; CHECK-NEXT:    i64.clz $push5=, $pop9
+; CHECK-NEXT:    local.get $push10=, 1
+; CHECK-NEXT:    i64.clz $push2=, $pop10
+; CHECK-NEXT:    i64.const $push3=, 64
+; CHECK-NEXT:    i64.add $push4=, $pop2, $pop3
+; CHECK-NEXT:    local.get $push11=, 2
+; CHECK-NEXT:    i64.const $push7=, 0
+; CHECK-NEXT:    i64.ne $push1=, $pop11, $pop7
+; CHECK-NEXT:    i64.select $push6=, $pop5, $pop4, $pop1
+; CHECK-NEXT:    i64.store 0($pop12), $pop6
+; CHECK-NEXT:    return
   %a = call i128 @llvm.ctlz.i128(i128 %x, i1 true)
   ret i128 %a
 }
 
-; CHECK-LABEL: ctz128:
-; CHECK-NEXT: .functype ctz128 (i32, i64, i64) -> (){{$}}
-; CHECK:      i64.ctz
-; CHECK:      i64.ctz
-; CHECK:      return{{$}}
 define i128 @ctz128(i128 %x) {
+; CHECK-LABEL: ctz128:
+; CHECK:         .functype ctz128 (i32, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push8=, 0
+; CHECK-NEXT:    i64.const $push0=, 0
+; CHECK-NEXT:    i64.store 8($pop8), $pop0
+; CHECK-NEXT:    local.get $push12=, 0
+; CHECK-NEXT:    local.get $push9=, 1
+; CHECK-NEXT:    i64.ctz $push5=, $pop9
+; CHECK-NEXT:    local.get $push10=, 2
+; CHECK-NEXT:    i64.ctz $push2=, $pop10
+; CHECK-NEXT:    i64.const $push3=, 64
+; CHECK-NEXT:    i64.add $push4=, $pop2, $pop3
+; CHECK-NEXT:    local.get $push11=, 1
+; CHECK-NEXT:    i64.const $push7=, 0
+; CHECK-NEXT:    i64.ne $push1=, $pop11, $pop7
+; CHECK-NEXT:    i64.select $push6=, $pop5, $pop4, $pop1
+; CHECK-NEXT:    i64.store 0($pop12), $pop6
+; CHECK-NEXT:    return
   %a = call i128 @llvm.cttz.i128(i128 %x, i1 false)
   ret i128 %a
 }
 
-; CHECK-LABEL: ctz128_zero_undef:
-; CHECK-NEXT: .functype ctz128_zero_undef (i32, i64, i64) -> (){{$}}
-; CHECK:      i64.ctz
-; CHECK:      i64.ctz
-; CHECK:      return{{$}}
 define i128 @ctz128_zero_undef(i128 %x) {
+; CHECK-LABEL: ctz128_zero_undef:
+; CHECK:         .functype ctz128_zero_undef (i32, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push8=, 0
+; CHECK-NEXT:    i64.const $push0=, 0
+; CHECK-NEXT:    i64.store 8($pop8), $pop0
+; CHECK-NEXT:    local.get $push12=, 0
+; CHECK-NEXT:    local.get $push9=, 1
+; CHECK-NEXT:    i64.ctz $push5=, $pop9
+; CHECK-NEXT:    local.get $push10=, 2
+; CHECK-NEXT:    i64.ctz $push2=, $pop10
+; CHECK-NEXT:    i64.const $push3=, 64
+; CHECK-NEXT:    i64.add $push4=, $pop2, $pop3
+; CHECK-NEXT:    local.get $push11=, 1
+; CHECK-NEXT:    i64.const $push7=, 0
+; CHECK-NEXT:    i64.ne $push1=, $pop11, $pop7
+; CHECK-NEXT:    i64.select $push6=, $pop5, $pop4, $pop1
+; CHECK-NEXT:    i64.store 0($pop12), $pop6
+; CHECK-NEXT:    return
   %a = call i128 @llvm.cttz.i128(i128 %x, i1 true)
   ret i128 %a
 }
 
-; CHECK-LABEL: popcnt128:
-; CHECK-NEXT: .functype popcnt128 (i32, i64, i64) -> (){{$}}
-; CHECK:      i64.popcnt
-; CHECK:      i64.popcnt
-; CHECK:      return{{$}}
 define i128 @popcnt128(i128 %x) {
+; CHECK-LABEL: popcnt128:
+; CHECK:         .functype popcnt128 (i32, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    i64.const $push0=, 0
+; CHECK-NEXT:    i64.store 8($pop4), $pop0
+; CHECK-NEXT:    local.get $push7=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    i64.popcnt $push2=, $pop5
+; CHECK-NEXT:    local.get $push6=, 2
+; CHECK-NEXT:    i64.popcnt $push1=, $pop6
+; CHECK-NEXT:    i64.add $push3=, $pop2, $pop1
+; CHECK-NEXT:    i64.store 0($pop7), $pop3
+; CHECK-NEXT:    return
   %a = call i128 @llvm.ctpop.i128(i128 %x)
   ret i128 %a
 }
 
-; CHECK-LABEL: eqz128:
-; CHECK-NEXT: .functype eqz128 (i64, i64) -> (i32){{$}}
-; CHECK:     i64.or
-; CHECK:     i64.eqz
-; CHECK:     return $
 define i32 @eqz128(i128 %x) {
+; CHECK-LABEL: eqz128:
+; CHECK:         .functype eqz128 (i64, i64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push3=, 0
+; CHECK-NEXT:    local.get $push2=, 1
+; CHECK-NEXT:    i64.or $push0=, $pop3, $pop2
+; CHECK-NEXT:    i64.eqz $push1=, $pop0
+; CHECK-NEXT:    return $pop1
   %a = icmp eq i128 %x, 0
   %b = zext i1 %a to i32
   ret i32 %b
 }
 
-; CHECK-LABEL: rotl:
-; CHECK-NEXT: .functype rotl (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK: call __ashlti3, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
-; CHECK: call __lshrti3, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
-; CHECK: return{{$}}
 define i128 @rotl(i128 %x, i128 %y) {
+; CHECK-LABEL: rotl:
+; CHECK:         .functype rotl (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:    .local i32, i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push11=, __stack_pointer
+; CHECK-NEXT:    i32.const $push12=, 32
+; CHECK-NEXT:    i32.sub $push23=, $pop11, $pop12
+; CHECK-NEXT:    local.tee $push22=, 5, $pop23
+; CHECK-NEXT:    global.set __stack_pointer, $pop22
+; CHECK-NEXT:    local.get $push24=, 5
+; CHECK-NEXT:    i32.const $push15=, 16
+; CHECK-NEXT:    i32.add $push16=, $pop24, $pop15
+; CHECK-NEXT:    local.get $push27=, 1
+; CHECK-NEXT:    local.get $push26=, 2
+; CHECK-NEXT:    local.get $push25=, 3
+; CHECK-NEXT:    i32.wrap_i64 $push21=, $pop25
+; CHECK-NEXT:    local.tee $push20=, 6, $pop21
+; CHECK-NEXT:    call __ashlti3, $pop16, $pop27, $pop26, $pop20
+; CHECK-NEXT:    local.get $push31=, 5
+; CHECK-NEXT:    local.get $push30=, 1
+; CHECK-NEXT:    local.get $push29=, 2
+; CHECK-NEXT:    i32.const $push0=, 128
+; CHECK-NEXT:    local.get $push28=, 6
+; CHECK-NEXT:    i32.sub $push1=, $pop0, $pop28
+; CHECK-NEXT:    call __lshrti3, $pop31, $pop30, $pop29, $pop1
+; CHECK-NEXT:    local.get $push34=, 0
+; CHECK-NEXT:    local.get $push32=, 5
+; CHECK-NEXT:    i32.const $push17=, 16
+; CHECK-NEXT:    i32.add $push18=, $pop32, $pop17
+; CHECK-NEXT:    i32.const $push2=, 8
+; CHECK-NEXT:    i32.add $push3=, $pop18, $pop2
+; CHECK-NEXT:    i64.load $push4=, 0($pop3)
+; CHECK-NEXT:    local.get $push33=, 5
+; CHECK-NEXT:    i32.const $push19=, 8
+; CHECK-NEXT:    i32.add $push5=, $pop33, $pop19
+; CHECK-NEXT:    i64.load $push6=, 0($pop5)
+; CHECK-NEXT:    i64.or $push7=, $pop4, $pop6
+; CHECK-NEXT:    i64.store 8($pop34), $pop7
+; CHECK-NEXT:    local.get $push37=, 0
+; CHECK-NEXT:    local.get $push35=, 5
+; CHECK-NEXT:    i64.load $push8=, 16($pop35)
+; CHECK-NEXT:    local.get $push36=, 5
+; CHECK-NEXT:    i64.load $push9=, 0($pop36)
+; CHECK-NEXT:    i64.or $push10=, $pop8, $pop9
+; CHECK-NEXT:    i64.store 0($pop37), $pop10
+; CHECK-NEXT:    local.get $push38=, 5
+; CHECK-NEXT:    i32.const $push13=, 32
+; CHECK-NEXT:    i32.add $push14=, $pop38, $pop13
+; CHECK-NEXT:    global.set __stack_pointer, $pop14
+; CHECK-NEXT:    return
   %z = sub i128 128, %y
   %b = shl i128 %x, %y
   %c = lshr i128 %x, %z
@@ -217,12 +582,59 @@ define i128 @rotl(i128 %x, i128 %y) {
   ret i128 %d
 }
 
-; CHECK-LABEL: masked_rotl:
-; CHECK-NEXT: .functype masked_rotl (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK: call __ashlti3, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
-; CHECK: call __lshrti3, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
-; CHECK: return{{$}}
 define i128 @masked_rotl(i128 %x, i128 %y) {
+; CHECK-LABEL: masked_rotl:
+; CHECK:         .functype masked_rotl (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:    .local i32, i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push13=, __stack_pointer
+; CHECK-NEXT:    i32.const $push14=, 32
+; CHECK-NEXT:    i32.sub $push25=, $pop13, $pop14
+; CHECK-NEXT:    local.tee $push24=, 5, $pop25
+; CHECK-NEXT:    global.set __stack_pointer, $pop24
+; CHECK-NEXT:    local.get $push26=, 5
+; CHECK-NEXT:    i32.const $push17=, 16
+; CHECK-NEXT:    i32.add $push18=, $pop26, $pop17
+; CHECK-NEXT:    local.get $push29=, 1
+; CHECK-NEXT:    local.get $push28=, 2
+; CHECK-NEXT:    local.get $push27=, 3
+; CHECK-NEXT:    i32.wrap_i64 $push0=, $pop27
+; CHECK-NEXT:    i32.const $push1=, 127
+; CHECK-NEXT:    i32.and $push23=, $pop0, $pop1
+; CHECK-NEXT:    local.tee $push22=, 6, $pop23
+; CHECK-NEXT:    call __ashlti3, $pop18, $pop29, $pop28, $pop22
+; CHECK-NEXT:    local.get $push33=, 5
+; CHECK-NEXT:    local.get $push32=, 1
+; CHECK-NEXT:    local.get $push31=, 2
+; CHECK-NEXT:    i32.const $push2=, 128
+; CHECK-NEXT:    local.get $push30=, 6
+; CHECK-NEXT:    i32.sub $push3=, $pop2, $pop30
+; CHECK-NEXT:    call __lshrti3, $pop33, $pop32, $pop31, $pop3
+; CHECK-NEXT:    local.get $push36=, 0
+; CHECK-NEXT:    local.get $push34=, 5
+; CHECK-NEXT:    i32.const $push19=, 16
+; CHECK-NEXT:    i32.add $push20=, $pop34, $pop19
+; CHECK-NEXT:    i32.const $push4=, 8
+; CHECK-NEXT:    i32.add $push5=, $pop20, $pop4
+; CHECK-NEXT:    i64.load $push6=, 0($pop5)
+; CHECK-NEXT:    local.get $push35=, 5
+; CHECK-NEXT:    i32.const $push21=, 8
+; CHECK-NEXT:    i32.add $push7=, $pop35, $pop21
+; CHECK-NEXT:    i64.load $push8=, 0($pop7)
+; CHECK-NEXT:    i64.or $push9=, $pop6, $pop8
+; CHECK-NEXT:    i64.store 8($pop36), $pop9
+; CHECK-NEXT:    local.get $push39=, 0
+; CHECK-NEXT:    local.get $push37=, 5
+; CHECK-NEXT:    i64.load $push10=, 16($pop37)
+; CHECK-NEXT:    local.get $push38=, 5
+; CHECK-NEXT:    i64.load $push11=, 0($pop38)
+; CHECK-NEXT:    i64.or $push12=, $pop10, $pop11
+; CHECK-NEXT:    i64.store 0($pop39), $pop12
+; CHECK-NEXT:    local.get $push40=, 5
+; CHECK-NEXT:    i32.const $push15=, 32
+; CHECK-NEXT:    i32.add $push16=, $pop40, $pop15
+; CHECK-NEXT:    global.set __stack_pointer, $pop16
+; CHECK-NEXT:    return
   %a = and i128 %y, 127
   %z = sub i128 128, %a
   %b = shl i128 %x, %a
@@ -231,12 +643,57 @@ define i128 @masked_rotl(i128 %x, i128 %y) {
   ret i128 %d
 }
 
-; CHECK-LABEL: rotr:
-; CHECK-NEXT: .functype rotr (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK: call __lshrti3, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
-; CHECK: call __ashlti3, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
-; CHECK: return{{$}}
 define i128 @rotr(i128 %x, i128 %y) {
+; CHECK-LABEL: rotr:
+; CHECK:         .functype rotr (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:    .local i32, i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push11=, __stack_pointer
+; CHECK-NEXT:    i32.const $push12=, 32
+; CHECK-NEXT:    i32.sub $push23=, $pop11, $pop12
+; CHECK-NEXT:    local.tee $push22=, 5, $pop23
+; CHECK-NEXT:    global.set __stack_pointer, $pop22
+; CHECK-NEXT:    local.get $push24=, 5
+; CHECK-NEXT:    i32.const $push15=, 16
+; CHECK-NEXT:    i32.add $push16=, $pop24, $pop15
+; CHECK-NEXT:    local.get $push27=, 1
+; CHECK-NEXT:    local.get $push26=, 2
+; CHECK-NEXT:    local.get $push25=, 3
+; CHECK-NEXT:    i32.wrap_i64 $push21=, $pop25
+; CHECK-NEXT:    local.tee $push20=, 6, $pop21
+; CHECK-NEXT:    call __lshrti3, $pop16, $pop27, $pop26, $pop20
+; CHECK-NEXT:    local.get $push31=, 5
+; CHECK-NEXT:    local.get $push30=, 1
+; CHECK-NEXT:    local.get $push29=, 2
+; CHECK-NEXT:    i32.const $push0=, 128
+; CHECK-NEXT:    local.get $push28=, 6
+; CHECK-NEXT:    i32.sub $push1=, $pop0, $pop28
+; CHECK-NEXT:    call __ashlti3, $pop31, $pop30, $pop29, $pop1
+; CHECK-NEXT:    local.get $push34=, 0
+; CHECK-NEXT:    local.get $push32=, 5
+; CHECK-NEXT:    i32.const $push17=, 16
+; CHECK-NEXT:    i32.add $push18=, $pop32, $pop17
+; CHECK-NEXT:    i32.const $push2=, 8
+; CHECK-NEXT:    i32.add $push3=, $pop18, $pop2
+; CHECK-NEXT:    i64.load $push4=, 0($pop3)
+; CHECK-NEXT:    local.get $push33=, 5
+; CHECK-NEXT:    i32.const $push19=, 8
+; CHECK-NEXT:    i32.add $push5=, $pop33, $pop19
+; CHECK-NEXT:    i64.load $push6=, 0($pop5)
+; CHECK-NEXT:    i64.or $push7=, $pop4, $pop6
+; CHECK-NEXT:    i64.store 8($pop34), $pop7
+; CHECK-NEXT:    local.get $push37=, 0
+; CHECK-NEXT:    local.get $push35=, 5
+; CHECK-NEXT:    i64.load $push8=, 16($pop35)
+; CHECK-NEXT:    local.get $push36=, 5
+; CHECK-NEXT:    i64.load $push9=, 0($pop36)
+; CHECK-NEXT:    i64.or $push10=, $pop8, $pop9
+; CHECK-NEXT:    i64.store 0($pop37), $pop10
+; CHECK-NEXT:    local.get $push38=, 5
+; CHECK-NEXT:    i32.const $push13=, 32
+; CHECK-NEXT:    i32.add $push14=, $pop38, $pop13
+; CHECK-NEXT:    global.set __stack_pointer, $pop14
+; CHECK-NEXT:    return
   %z = sub i128 128, %y
   %b = lshr i128 %x, %y
   %c = shl i128 %x, %z
@@ -244,12 +701,59 @@ define i128 @rotr(i128 %x, i128 %y) {
   ret i128 %d
 }
 
-; CHECK-LABEL: masked_rotr:
-; CHECK-NEXT: .functype masked_rotr (i32, i64, i64, i64, i64) -> (){{$}}
-; CHECK: call __lshrti3, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
-; CHECK: call __ashlti3, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
-; CHECK: return{{$}}
 define i128 @masked_rotr(i128 %x, i128 %y) {
+; CHECK-LABEL: masked_rotr:
+; CHECK:         .functype masked_rotr (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:    .local i32, i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push13=, __stack_pointer
+; CHECK-NEXT:    i32.const $push14=, 32
+; CHECK-NEXT:    i32.sub $push25=, $pop13, $pop14
+; CHECK-NEXT:    local.tee $push24=, 5, $pop25
+; CHECK-NEXT:    global.set __stack_pointer, $pop24
+; CHECK-NEXT:    local.get $push26=, 5
+; CHECK-NEXT:    i32.const $push17=, 16
+; CHECK-NEXT:    i32.add $push18=, $pop26, $pop17
+; CHECK-NEXT:    local.get $push29=, 1
+; CHECK-NEXT:    local.get $push28=, 2
+; CHECK-NEXT:    local.get $push27=, 3
+; CHECK-NEXT:    i32.wrap_i64 $push0=, $pop27
+; CHECK-NEXT:    i32.const $push1=, 127
+; CHECK-NEXT:    i32.and $push23=, $pop0, $pop1
+; CHECK-NEXT:    local.tee $push22=, 6, $pop23
+; CHECK-NEXT:    call __lshrti3, $pop18, $pop29, $pop28, $pop22
+; CHECK-NEXT:    local.get $push33=, 5
+; CHECK-NEXT:    local.get $push32=, 1
+; CHECK-NEXT:    local.get $push31=, 2
+; CHECK-NEXT:    i32.const $push2=, 128
+; CHECK-NEXT:    local.get $push30=, 6
+; CHECK-NEXT:    i32.sub $push3=, $pop2, $pop30
+; CHECK-NEXT:    call __ashlti3, $pop33, $pop32, $pop31, $pop3
+; CHECK-NEXT:    local.get $push36=, 0
+; CHECK-NEXT:    local.get $push34=, 5
+; CHECK-NEXT:    i32.const $push19=, 16
+; CHECK-NEXT:    i32.add $push20=, $pop34, $pop19
+; CHECK-NEXT:    i32.const $push4=, 8
+; CHECK-NEXT:    i32.add $push5=, $pop20, $pop4
+; CHECK-NEXT:    i64.load $push6=, 0($pop5)
+; CHECK-NEXT:    local.get $push35=, 5
+; CHECK-NEXT:    i32.const $push21=, 8
+; CHECK-NEXT:    i32.add $push7=, $pop35, $pop21
+; CHECK-NEXT:    i64.load $push8=, 0($pop7)
+; CHECK-NEXT:    i64.or $push9=, $pop6, $pop8
+; CHECK-NEXT:    i64.store 8($pop36), $pop9
+; CHECK-NEXT:    local.get $push39=, 0
+; CHECK-NEXT:    local.get $push37=, 5
+; CHECK-NEXT:    i64.load $push10=, 16($pop37)
+; CHECK-NEXT:    local.get $push38=, 5
+; CHECK-NEXT:    i64.load $push11=, 0($pop38)
+; CHECK-NEXT:    i64.or $push12=, $pop10, $pop11
+; CHECK-NEXT:    i64.store 0($pop39), $pop12
+; CHECK-NEXT:    local.get $push40=, 5
+; CHECK-NEXT:    i32.const $push15=, 32
+; CHECK-NEXT:    i32.add $push16=, $pop40, $pop15
+; CHECK-NEXT:    global.set __stack_pointer, $pop16
+; CHECK-NEXT:    return
   %a = and i128 %y, 127
   %z = sub i128 128, %a
   %b = lshr i128 %x, %a

diff  --git a/llvm/test/CodeGen/WebAssembly/libcalls.ll b/llvm/test/CodeGen/WebAssembly/libcalls.ll
index efa4041c89142..d11f0d4fd23df 100644
--- a/llvm/test/CodeGen/WebAssembly/libcalls.ll
+++ b/llvm/test/CodeGen/WebAssembly/libcalls.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
 
 ; Test a subset of compiler-rt/libm libcalls expected to be emitted by the wasm backend
 
@@ -21,61 +22,239 @@ declare i32 @llvm.lround(double)
 
 
 
-; CHECK-LABEL: fp128libcalls:
 define fp128 @fp128libcalls(fp128 %x, fp128 %y, i32 %z) {
   ; compiler-rt call
-  ; CHECK: call __addtf3
+; CHECK-LABEL: fp128libcalls:
+; CHECK:         .functype fp128libcalls (i32, i64, i64, i64, i64, i32) -> ()
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push28=, __stack_pointer
+; CHECK-NEXT:    i32.const $push29=, 144
+; CHECK-NEXT:    i32.sub $push73=, $pop28, $pop29
+; CHECK-NEXT:    local.tee $push72=, 6, $pop73
+; CHECK-NEXT:    global.set __stack_pointer, $pop72
+; CHECK-NEXT:    local.get $push74=, 6
+; CHECK-NEXT:    i32.const $push32=, 128
+; CHECK-NEXT:    i32.add $push33=, $pop74, $pop32
+; CHECK-NEXT:    local.get $push78=, 1
+; CHECK-NEXT:    local.get $push77=, 2
+; CHECK-NEXT:    local.get $push76=, 3
+; CHECK-NEXT:    local.get $push75=, 4
+; CHECK-NEXT:    call __addtf3, $pop33, $pop78, $pop77, $pop76, $pop75
+; CHECK-NEXT:    local.get $push79=, 6
+; CHECK-NEXT:    i32.const $push36=, 112
+; CHECK-NEXT:    i32.add $push37=, $pop79, $pop36
+; CHECK-NEXT:    local.get $push80=, 6
+; CHECK-NEXT:    i64.load $push3=, 128($pop80)
+; CHECK-NEXT:    local.get $push81=, 6
+; CHECK-NEXT:    i32.const $push34=, 128
+; CHECK-NEXT:    i32.add $push35=, $pop81, $pop34
+; CHECK-NEXT:    i32.const $push0=, 8
+; CHECK-NEXT:    i32.add $push1=, $pop35, $pop0
+; CHECK-NEXT:    i64.load $push2=, 0($pop1)
+; CHECK-NEXT:    local.get $push83=, 3
+; CHECK-NEXT:    local.get $push82=, 4
+; CHECK-NEXT:    call __multf3, $pop37, $pop3, $pop2, $pop83, $pop82
+; CHECK-NEXT:    local.get $push84=, 6
+; CHECK-NEXT:    i32.const $push40=, 96
+; CHECK-NEXT:    i32.add $push41=, $pop84, $pop40
+; CHECK-NEXT:    local.get $push85=, 6
+; CHECK-NEXT:    i64.load $push6=, 112($pop85)
+; CHECK-NEXT:    local.get $push86=, 6
+; CHECK-NEXT:    i32.const $push38=, 112
+; CHECK-NEXT:    i32.add $push39=, $pop86, $pop38
+; CHECK-NEXT:    i32.const $push71=, 8
+; CHECK-NEXT:    i32.add $push4=, $pop39, $pop71
+; CHECK-NEXT:    i64.load $push5=, 0($pop4)
+; CHECK-NEXT:    local.get $push88=, 3
+; CHECK-NEXT:    local.get $push87=, 4
+; CHECK-NEXT:    call __divtf3, $pop41, $pop6, $pop5, $pop88, $pop87
+; CHECK-NEXT:    local.get $push89=, 6
+; CHECK-NEXT:    i32.const $push44=, 80
+; CHECK-NEXT:    i32.add $push45=, $pop89, $pop44
+; CHECK-NEXT:    local.get $push90=, 6
+; CHECK-NEXT:    i64.load $push9=, 96($pop90)
+; CHECK-NEXT:    local.get $push91=, 6
+; CHECK-NEXT:    i32.const $push42=, 96
+; CHECK-NEXT:    i32.add $push43=, $pop91, $pop42
+; CHECK-NEXT:    i32.const $push70=, 8
+; CHECK-NEXT:    i32.add $push7=, $pop43, $pop70
+; CHECK-NEXT:    i64.load $push8=, 0($pop7)
+; CHECK-NEXT:    call sqrtl, $pop45, $pop9, $pop8
+; CHECK-NEXT:    local.get $push92=, 6
+; CHECK-NEXT:    i32.const $push48=, 64
+; CHECK-NEXT:    i32.add $push49=, $pop92, $pop48
+; CHECK-NEXT:    local.get $push93=, 6
+; CHECK-NEXT:    i64.load $push12=, 80($pop93)
+; CHECK-NEXT:    local.get $push94=, 6
+; CHECK-NEXT:    i32.const $push46=, 80
+; CHECK-NEXT:    i32.add $push47=, $pop94, $pop46
+; CHECK-NEXT:    i32.const $push69=, 8
+; CHECK-NEXT:    i32.add $push10=, $pop47, $pop69
+; CHECK-NEXT:    i64.load $push11=, 0($pop10)
+; CHECK-NEXT:    call floorl, $pop49, $pop12, $pop11
+; CHECK-NEXT:    local.get $push95=, 6
+; CHECK-NEXT:    i32.const $push52=, 48
+; CHECK-NEXT:    i32.add $push53=, $pop95, $pop52
+; CHECK-NEXT:    local.get $push96=, 6
+; CHECK-NEXT:    i64.load $push15=, 64($pop96)
+; CHECK-NEXT:    local.get $push97=, 6
+; CHECK-NEXT:    i32.const $push50=, 64
+; CHECK-NEXT:    i32.add $push51=, $pop97, $pop50
+; CHECK-NEXT:    i32.const $push68=, 8
+; CHECK-NEXT:    i32.add $push13=, $pop51, $pop68
+; CHECK-NEXT:    i64.load $push14=, 0($pop13)
+; CHECK-NEXT:    local.get $push99=, 3
+; CHECK-NEXT:    local.get $push98=, 4
+; CHECK-NEXT:    call powl, $pop53, $pop15, $pop14, $pop99, $pop98
+; CHECK-NEXT:    local.get $push100=, 6
+; CHECK-NEXT:    i32.const $push56=, 32
+; CHECK-NEXT:    i32.add $push57=, $pop100, $pop56
+; CHECK-NEXT:    local.get $push101=, 6
+; CHECK-NEXT:    i64.load $push18=, 48($pop101)
+; CHECK-NEXT:    local.get $push102=, 6
+; CHECK-NEXT:    i32.const $push54=, 48
+; CHECK-NEXT:    i32.add $push55=, $pop102, $pop54
+; CHECK-NEXT:    i32.const $push67=, 8
+; CHECK-NEXT:    i32.add $push16=, $pop55, $pop67
+; CHECK-NEXT:    i64.load $push17=, 0($pop16)
+; CHECK-NEXT:    local.get $push103=, 5
+; CHECK-NEXT:    call __powitf2, $pop57, $pop18, $pop17, $pop103
+; CHECK-NEXT:    local.get $push104=, 6
+; CHECK-NEXT:    i32.const $push60=, 16
+; CHECK-NEXT:    i32.add $push61=, $pop104, $pop60
+; CHECK-NEXT:    local.get $push105=, 6
+; CHECK-NEXT:    i64.load $push21=, 32($pop105)
+; CHECK-NEXT:    local.get $push106=, 6
+; CHECK-NEXT:    i32.const $push58=, 32
+; CHECK-NEXT:    i32.add $push59=, $pop106, $pop58
+; CHECK-NEXT:    i32.const $push66=, 8
+; CHECK-NEXT:    i32.add $push19=, $pop59, $pop66
+; CHECK-NEXT:    i64.load $push20=, 0($pop19)
+; CHECK-NEXT:    call truncl, $pop61, $pop21, $pop20
+; CHECK-NEXT:    local.get $push109=, 6
+; CHECK-NEXT:    local.get $push107=, 6
+; CHECK-NEXT:    i64.load $push24=, 16($pop107)
+; CHECK-NEXT:    local.get $push108=, 6
+; CHECK-NEXT:    i32.const $push62=, 16
+; CHECK-NEXT:    i32.add $push63=, $pop108, $pop62
+; CHECK-NEXT:    i32.const $push65=, 8
+; CHECK-NEXT:    i32.add $push22=, $pop63, $pop65
+; CHECK-NEXT:    i64.load $push23=, 0($pop22)
+; CHECK-NEXT:    call nearbyintl, $pop109, $pop24, $pop23
+; CHECK-NEXT:    local.get $push111=, 0
+; CHECK-NEXT:    local.get $push110=, 6
+; CHECK-NEXT:    i32.const $push64=, 8
+; CHECK-NEXT:    i32.add $push25=, $pop110, $pop64
+; CHECK-NEXT:    i64.load $push26=, 0($pop25)
+; CHECK-NEXT:    i64.store 8($pop111), $pop26
+; CHECK-NEXT:    local.get $push113=, 0
+; CHECK-NEXT:    local.get $push112=, 6
+; CHECK-NEXT:    i64.load $push27=, 0($pop112)
+; CHECK-NEXT:    i64.store 0($pop113), $pop27
+; CHECK-NEXT:    local.get $push114=, 6
+; CHECK-NEXT:    i32.const $push30=, 144
+; CHECK-NEXT:    i32.add $push31=, $pop114, $pop30
+; CHECK-NEXT:    global.set __stack_pointer, $pop31
+; CHECK-NEXT:    return
   %a = fadd fp128 %x, %y
-  ; CHECK: call __multf3
   %b = fmul fp128 %a, %y
-  ; CHECK: call __divtf3
   %c = fdiv fp128 %b, %y
   ; libm calls
-  ; CHECK: call sqrtl
   %d = call fp128 @llvm.sqrt.f128(fp128 %c)
-  ; CHECK: call floorl
   %e = call fp128 @llvm.floor.f128(fp128 %d)
-  ; CHECK: call powl
   %f = call fp128 @llvm.pow.f128(fp128 %e, fp128 %y)
-  ; CHECK: call __powitf2
   %g = call fp128 @llvm.powi.f128.i32(fp128 %f, i32 %z)
-  ; CHECK: call truncl
   %h = call fp128 @llvm.trunc.f128(fp128 %g)
-  ; CHECK: call nearbyintl
   %i = call fp128 @llvm.nearbyint.f128(fp128 %h)
   ret fp128 %i
 }
 
-; CHECK-LABEL: i128libcalls:
 define i128 @i128libcalls(i128 %x, i128 %y) {
   ; Basic ops should be expanded
-  ; CHECK: .local
-  ; CHECK-NOT: call
+; CHECK-LABEL: i128libcalls:
+; CHECK:         .functype i128libcalls (i32, i64, i64, i64, i64) -> ()
+; CHECK-NEXT:    .local i32, i64
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push11=, __stack_pointer
+; CHECK-NEXT:    i32.const $push12=, 32
+; CHECK-NEXT:    i32.sub $push23=, $pop11, $pop12
+; CHECK-NEXT:    local.tee $push22=, 5, $pop23
+; CHECK-NEXT:    global.set __stack_pointer, $pop22
+; CHECK-NEXT:    local.get $push24=, 5
+; CHECK-NEXT:    i32.const $push15=, 16
+; CHECK-NEXT:    i32.add $push16=, $pop24, $pop15
+; CHECK-NEXT:    local.get $push26=, 1
+; CHECK-NEXT:    local.get $push25=, 3
+; CHECK-NEXT:    i64.add $push21=, $pop26, $pop25
+; CHECK-NEXT:    local.tee $push20=, 6, $pop21
+; CHECK-NEXT:    local.get $push28=, 2
+; CHECK-NEXT:    local.get $push27=, 4
+; CHECK-NEXT:    i64.add $push0=, $pop28, $pop27
+; CHECK-NEXT:    local.get $push30=, 6
+; CHECK-NEXT:    local.get $push29=, 1
+; CHECK-NEXT:    i64.lt_u $push1=, $pop30, $pop29
+; CHECK-NEXT:    i64.extend_i32_u $push2=, $pop1
+; CHECK-NEXT:    i64.add $push3=, $pop0, $pop2
+; CHECK-NEXT:    local.get $push32=, 3
+; CHECK-NEXT:    local.get $push31=, 4
+; CHECK-NEXT:    call __multi3, $pop16, $pop20, $pop3, $pop32, $pop31
+; CHECK-NEXT:    local.get $push37=, 5
+; CHECK-NEXT:    local.get $push33=, 5
+; CHECK-NEXT:    i64.load $push7=, 16($pop33)
+; CHECK-NEXT:    local.get $push34=, 5
+; CHECK-NEXT:    i32.const $push17=, 16
+; CHECK-NEXT:    i32.add $push18=, $pop34, $pop17
+; CHECK-NEXT:    i32.const $push4=, 8
+; CHECK-NEXT:    i32.add $push5=, $pop18, $pop4
+; CHECK-NEXT:    i64.load $push6=, 0($pop5)
+; CHECK-NEXT:    local.get $push36=, 3
+; CHECK-NEXT:    local.get $push35=, 4
+; CHECK-NEXT:    call __umodti3, $pop37, $pop7, $pop6, $pop36, $pop35
+; CHECK-NEXT:    local.get $push39=, 0
+; CHECK-NEXT:    local.get $push38=, 5
+; CHECK-NEXT:    i32.const $push19=, 8
+; CHECK-NEXT:    i32.add $push8=, $pop38, $pop19
+; CHECK-NEXT:    i64.load $push9=, 0($pop8)
+; CHECK-NEXT:    i64.store 8($pop39), $pop9
+; CHECK-NEXT:    local.get $push41=, 0
+; CHECK-NEXT:    local.get $push40=, 5
+; CHECK-NEXT:    i64.load $push10=, 0($pop40)
+; CHECK-NEXT:    i64.store 0($pop41), $pop10
+; CHECK-NEXT:    local.get $push42=, 5
+; CHECK-NEXT:    i32.const $push13=, 32
+; CHECK-NEXT:    i32.add $push14=, $pop42, $pop13
+; CHECK-NEXT:    global.set __stack_pointer, $pop14
+; CHECK-NEXT:    return
   %a = add i128 %x, %y
-  ; CHECK: call __multi3
   %b = mul i128 %a, %y
-  ; CHECK: call __umodti3
   %c = urem i128 %b, %y
   ret i128 %c
 }
 
-; CHECK-LABEL: f64libcalls:
 define i32 @f64libcalls(double %x, double %y, i32 %z) {
- ; CHECK: call $push{{[0-9]}}=, cos
+; CHECK-LABEL: f64libcalls:
+; CHECK:         .functype f64libcalls (f64, f64, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push8=, 0
+; CHECK-NEXT:    call $push0=, cos, $pop8
+; CHECK-NEXT:    call $push1=, log10, $pop0
+; CHECK-NEXT:    local.get $push9=, 1
+; CHECK-NEXT:    call $push2=, pow, $pop1, $pop9
+; CHECK-NEXT:    local.get $push10=, 2
+; CHECK-NEXT:    call $push3=, __powidf2, $pop2, $pop10
+; CHECK-NEXT:    call $push4=, log, $pop3
+; CHECK-NEXT:    call $push5=, exp, $pop4
+; CHECK-NEXT:    call $push6=, cbrt, $pop5
+; CHECK-NEXT:    call $push7=, lround, $pop6
+; CHECK-NEXT:    return $pop7
  %a = call double @llvm.cos.f64(double %x)
- ; CHECK: call $push{{[0-9]}}=, log10
  %b = call double @llvm.log10.f64(double %a)
- ; CHECK: call $push{{[0-9]}}=, pow
  %c = call double @llvm.pow.f64(double %b, double %y)
- ; CHECK: call $push{{[0-9]}}=, __powidf2
  %d = call double @llvm.powi.f64.i32(double %c, i32 %z)
- ; CHECK: call $push{{[0-9]}}=, log
  %e = call double @llvm.log.f64(double %d)
- ; CHECK: call $push{{[0-9]}}=, exp
  %f = call double @llvm.exp.f64(double %e)
- ; CHECK: call $push{{[0-9]}}=, cbrt
  %g = call fast double @llvm.pow.f64(double %f, double 0x3FD5555555555555)
- ; CHECK: call $push{{[0-9]}}=, lround
  %h = call i32 @llvm.lround(double %g)
  ret i32 %h
 }
@@ -84,45 +263,85 @@ define i32 @f64libcalls(double %x, double %y, i32 %z) {
 ; comment in WebAssemblyRunimeLibcallSignatures.cpp) so check them separately.
 ; no libcalls are needed for f32 and f64
 
-; CHECK-LABEL: unordd:
 define i1 @unordd(double %x, double %y) {
- ; CHECK-NOT: call
- ; CHECK: f64.ne
+; CHECK-LABEL: unordd:
+; CHECK:         .functype unordd (f64, f64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push8=, 0
+; CHECK-NEXT:    local.get $push7=, 0
+; CHECK-NEXT:    f64.ne $push4=, $pop8, $pop7
+; CHECK-NEXT:    local.get $push10=, 1
+; CHECK-NEXT:    local.get $push9=, 1
+; CHECK-NEXT:    f64.ne $push3=, $pop10, $pop9
+; CHECK-NEXT:    i32.or $push5=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push12=, 0
+; CHECK-NEXT:    local.get $push11=, 0
+; CHECK-NEXT:    f64.eq $push1=, $pop12, $pop11
+; CHECK-NEXT:    local.get $push14=, 1
+; CHECK-NEXT:    local.get $push13=, 1
+; CHECK-NEXT:    f64.eq $push0=, $pop14, $pop13
+; CHECK-NEXT:    i32.and $push2=, $pop1, $pop0
+; CHECK-NEXT:    i32.xor $push6=, $pop5, $pop2
+; CHECK-NEXT:    return $pop6
  %a = fcmp uno double %x, %y
- ; CHECK-NOT: call
- ; CHECK: f64.eq
  %b = fcmp ord double %x, %y
- ; CHECK: i32.xor
  %c = xor i1 %a, %b
  ret i1 %c
 }
 
-; CHECK-LABEL: unordf:
 define i1 @unordf(float %x, float %y) {
- ; CHECK-NOT: call
- ; CHECK: f32.ne
+; CHECK-LABEL: unordf:
+; CHECK:         .functype unordf (f32, f32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push8=, 0
+; CHECK-NEXT:    local.get $push7=, 0
+; CHECK-NEXT:    f32.ne $push4=, $pop8, $pop7
+; CHECK-NEXT:    local.get $push10=, 1
+; CHECK-NEXT:    local.get $push9=, 1
+; CHECK-NEXT:    f32.ne $push3=, $pop10, $pop9
+; CHECK-NEXT:    i32.or $push5=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push12=, 0
+; CHECK-NEXT:    local.get $push11=, 0
+; CHECK-NEXT:    f32.eq $push1=, $pop12, $pop11
+; CHECK-NEXT:    local.get $push14=, 1
+; CHECK-NEXT:    local.get $push13=, 1
+; CHECK-NEXT:    f32.eq $push0=, $pop14, $pop13
+; CHECK-NEXT:    i32.and $push2=, $pop1, $pop0
+; CHECK-NEXT:    i32.xor $push6=, $pop5, $pop2
+; CHECK-NEXT:    return $pop6
  %a = fcmp uno float %x, %y
- ; CHECK-NOT: call
- ; CHECK: f32.eq
  %b = fcmp ord float %x, %y
- ; CHECK: i32.xor
  %c = xor i1 %a, %b
  ret i1 %c
 }
 
-; CHECK-LABEL: unordt:
 define i1 @unordt(fp128 %x, fp128 %y) {
- ; CHECK: call $push[[CALL:[0-9]]]=, __unordtf2
- ; CHECK-NEXT: i32.const $push[[ZERO:[0-9]+]]=, 0
- ; CHECK-NEXT: i32.ne $push{{[0-9]}}=, $pop[[CALL]], $pop[[ZERO]]
+; CHECK-LABEL: unordt:
+; CHECK:         .functype unordt (i64, i64, i64, i64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    local.get $push4=, 2
+; CHECK-NEXT:    local.get $push3=, 3
+; CHECK-NEXT:    call $push1=, __unordtf2, $pop6, $pop5, $pop4, $pop3
+; CHECK-NEXT:    i32.const $push0=, 0
+; CHECK-NEXT:    i32.ne $push2=, $pop1, $pop0
+; CHECK-NEXT:    return $pop2
  %a = fcmp uno fp128 %x, %y
  ret i1 %a
 }
 
-; CHECK-LABEL: ordt:
 define i1 @ordt(fp128 %x, fp128 %y) {
- ; CHECK: call $push[[CALL:[0-9]]]=, __unordtf2
- ; CHECK-NEXT: i32.eqz $push{{[0-9]}}=, $pop[[CALL]]
+; CHECK-LABEL: ordt:
+; CHECK:         .functype ordt (i64, i64, i64, i64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push5=, 0
+; CHECK-NEXT:    local.get $push4=, 1
+; CHECK-NEXT:    local.get $push3=, 2
+; CHECK-NEXT:    local.get $push2=, 3
+; CHECK-NEXT:    call $push0=, __unordtf2, $pop5, $pop4, $pop3, $pop2
+; CHECK-NEXT:    i32.eqz $push1=, $pop0
+; CHECK-NEXT:    return $pop1
  %a = fcmp ord fp128 %x, %y
  ret i1 %a
 }

diff  --git a/llvm/test/CodeGen/WebAssembly/mem-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/mem-intrinsics.ll
index dccd727362d78..51e6c2836a20f 100644
--- a/llvm/test/CodeGen/WebAssembly/mem-intrinsics.ll
+++ b/llvm/test/CodeGen/WebAssembly/mem-intrinsics.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -asm-verbose=false -mcpu=mvp -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -tail-dup-placement=0 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=mvp -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -tail-dup-placement=0 | FileCheck %s
 
 ; Test memcpy, memmove, and memset intrinsics.
 
@@ -10,60 +11,86 @@ declare void @llvm.memset.p0.i32(ptr nocapture, i8, i32, i1)
 
 ; Test that return values are optimized.
 
-; CHECK-LABEL: copy_yes:
-; CHECK:      call     $push0=, memcpy, $0, $1, $2{{$}}
-; CHECK-NEXT: return   $pop0{{$}}
 define ptr @copy_yes(ptr %dst, ptr %src, i32 %len) {
+; CHECK-LABEL: copy_yes:
+; CHECK:         .functype copy_yes (i32, i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    call $push0=, memcpy, $0, $1, $2
+; CHECK-NEXT:    return $pop0
   call void @llvm.memcpy.p0.p0.i32(ptr %dst, ptr %src, i32 %len, i1 false)
   ret ptr %dst
 }
-
-; CHECK-LABEL: copy_no:
-; CHECK:      call     $drop=, memcpy, $0, $1, $2{{$}}
-; CHECK-NEXT: return{{$}}
 define void @copy_no(ptr %dst, ptr %src, i32 %len) {
+; CHECK-LABEL: copy_no:
+; CHECK:         .functype copy_no (i32, i32, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    call $drop=, memcpy, $0, $1, $2
+; CHECK-NEXT:    return
   call void @llvm.memcpy.p0.p0.i32(ptr %dst, ptr %src, i32 %len, i1 false)
   ret void
 }
 
-; CHECK-LABEL: move_yes:
-; CHECK:      call     $push0=, memmove, $0, $1, $2{{$}}
-; CHECK-NEXT: return   $pop0{{$}}
 define ptr @move_yes(ptr %dst, ptr %src, i32 %len) {
+; CHECK-LABEL: move_yes:
+; CHECK:         .functype move_yes (i32, i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    call $push0=, memmove, $0, $1, $2
+; CHECK-NEXT:    return $pop0
   call void @llvm.memmove.p0.p0.i32(ptr %dst, ptr %src, i32 %len, i1 false)
   ret ptr %dst
 }
 
-; CHECK-LABEL: move_no:
-; CHECK:      call     $drop=, memmove, $0, $1, $2{{$}}
-; CHECK-NEXT: return{{$}}
 define void @move_no(ptr %dst, ptr %src, i32 %len) {
+; CHECK-LABEL: move_no:
+; CHECK:         .functype move_no (i32, i32, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    call $drop=, memmove, $0, $1, $2
+; CHECK-NEXT:    return
   call void @llvm.memmove.p0.p0.i32(ptr %dst, ptr %src, i32 %len, i1 false)
   ret void
 }
 
-; CHECK-LABEL: set_yes:
-; CHECK:      call     $push0=, memset, $0, $1, $2{{$}}
-; CHECK-NEXT: return   $pop0{{$}}
 define ptr @set_yes(ptr %dst, i8 %src, i32 %len) {
+; CHECK-LABEL: set_yes:
+; CHECK:         .functype set_yes (i32, i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    call $push0=, memset, $0, $1, $2
+; CHECK-NEXT:    return $pop0
   call void @llvm.memset.p0.i32(ptr %dst, i8 %src, i32 %len, i1 false)
   ret ptr %dst
 }
 
-; CHECK-LABEL: set_no:
-; CHECK:      call     $drop=, memset, $0, $1, $2{{$}}
-; CHECK-NEXT: return{{$}}
 define void @set_no(ptr %dst, i8 %src, i32 %len) {
+; CHECK-LABEL: set_no:
+; CHECK:         .functype set_no (i32, i32, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    call $drop=, memset, $0, $1, $2
+; CHECK-NEXT:    return
   call void @llvm.memset.p0.i32(ptr %dst, i8 %src, i32 %len, i1 false)
   ret void
 }
 
-
-; CHECK-LABEL: frame_index:
-; CHECK: call $drop=, memset, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK: call $push{{[0-9]+}}=, memset, ${{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
-; CHECK: return{{$}}
 define void @frame_index() {
+; CHECK-LABEL: frame_index:
+; CHECK:         .functype frame_index () -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    global.get $push3=, __stack_pointer
+; CHECK-NEXT:    i32.const $push4=, 4096
+; CHECK-NEXT:    i32.sub $push12=, $pop3, $pop4
+; CHECK-NEXT:    local.tee $push11=, $0=, $pop12
+; CHECK-NEXT:    global.set __stack_pointer, $pop11
+; CHECK-NEXT:    i32.const $push7=, 2048
+; CHECK-NEXT:    i32.add $push8=, $0, $pop7
+; CHECK-NEXT:    i32.const $push1=, 0
+; CHECK-NEXT:    i32.const $push0=, 1024
+; CHECK-NEXT:    call $drop=, memset, $pop8, $pop1, $pop0
+; CHECK-NEXT:    i32.const $push10=, 0
+; CHECK-NEXT:    i32.const $push9=, 1024
+; CHECK-NEXT:    call $push2=, memset, $0, $pop10, $pop9
+; CHECK-NEXT:    i32.const $push5=, 4096
+; CHECK-NEXT:    i32.add $push6=, $pop2, $pop5
+; CHECK-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-NEXT:    return
 entry:
   %a = alloca [2048 x i8], align 16
   %b = alloca [2048 x i8], align 16
@@ -76,11 +103,28 @@ entry:
 ; $drop. Note that we use a call to prevent tail dup so that we can test
 ; this specific functionality.
 
-; CHECK-LABEL: drop_result:
-; CHECK: call $drop=, memset, $0, $1, $2
 declare ptr @def()
 declare void @block_tail_dup()
 define ptr @drop_result(ptr %arg, i8 %arg1, i32 %arg2, i32 %arg3, i32 %arg4) {
+; CHECK-LABEL: drop_result:
+; CHECK:         .functype drop_result (i32, i32, i32, i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0: # %bb
+; CHECK-NEXT:    block
+; CHECK-NEXT:    block
+; CHECK-NEXT:    br_if 0, $3 # 0: down to label1
+; CHECK-NEXT:  # %bb.1: # %bb5
+; CHECK-NEXT:    br_if 1, $4 # 1: down to label0
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    call $drop=, memset, $0, $1, $2
+; CHECK-NEXT:    call block_tail_dup
+; CHECK-NEXT:    return $0
+; CHECK-NEXT:  .LBB7_3: # %bb9
+; CHECK-NEXT:    end_block # label1:
+; CHECK-NEXT:    call $0=, def
+; CHECK-NEXT:  .LBB7_4: # %bb11
+; CHECK-NEXT:    end_block # label0:
+; CHECK-NEXT:    call block_tail_dup
+; CHECK-NEXT:    return $0
 bb:
   %tmp = icmp eq i32 %arg3, 0
   br i1 %tmp, label %bb5, label %bb9
@@ -109,9 +153,24 @@ bb11:
 ; This is the same as drop_result, except we let tail dup happen, so the
 ; result of the memset *is* stackified.
 
-; CHECK-LABEL: tail_dup_to_reuse_result:
-; CHECK: call $push{{[0-9]+}}=, memset, $0, $1, $2
 define ptr @tail_dup_to_reuse_result(ptr %arg, i8 %arg1, i32 %arg2, i32 %arg3, i32 %arg4) {
+; CHECK-LABEL: tail_dup_to_reuse_result:
+; CHECK:         .functype tail_dup_to_reuse_result (i32, i32, i32, i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0: # %bb
+; CHECK-NEXT:    block
+; CHECK-NEXT:    block
+; CHECK-NEXT:    br_if 0, $3 # 0: down to label3
+; CHECK-NEXT:  # %bb.1: # %bb5
+; CHECK-NEXT:    br_if 1, $4 # 1: down to label2
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    call $push0=, memset, $0, $1, $2
+; CHECK-NEXT:    return $pop0
+; CHECK-NEXT:  .LBB8_3: # %bb9
+; CHECK-NEXT:    end_block # label3:
+; CHECK-NEXT:    call $0=, def
+; CHECK-NEXT:  .LBB8_4: # %bb11
+; CHECK-NEXT:    end_block # label2:
+; CHECK-NEXT:    return $0
 bb:
   %tmp = icmp eq i32 %arg3, 0
   br i1 %tmp, label %bb5, label %bb9

diff  --git a/llvm/test/CodeGen/WebAssembly/offset-fastisel.ll b/llvm/test/CodeGen/WebAssembly/offset-fastisel.ll
index f94a6a8c313d0..b896f8fde247e 100644
--- a/llvm/test/CodeGen/WebAssembly/offset-fastisel.ll
+++ b/llvm/test/CodeGen/WebAssembly/offset-fastisel.ll
@@ -1,83 +1,112 @@
-; RUN: llc < %s -asm-verbose=false -wasm-disable-explicit-locals -wasm-keep-registers -fast-isel -fast-isel-abort=1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -wasm-disable-explicit-locals -wasm-keep-registers -fast-isel -fast-isel-abort=1 | FileCheck %s
 
 ; TODO: Merge this with offset.ll when fast-isel matches better.
 
 target triple = "wasm32-unknown-unknown"
 
-; CHECK-LABEL: store_i8_with_variable_gep_offset:
-; CHECK: i32.add    $push[[L0:[0-9]+]]=, $0, $1{{$}}
-; CHECK: i32.const  $push[[L1:[0-9]+]]=, 0{{$}}
-; CHECK: i32.store8 0($pop[[L0]]), $pop[[L1]]{{$}}
 define void @store_i8_with_variable_gep_offset(ptr %p, i32 %idx) {
+; CHECK-LABEL: store_i8_with_variable_gep_offset:
+; CHECK:         .functype store_i8_with_variable_gep_offset (i32, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.add $push1=, $0, $1
+; CHECK-NEXT:    i32.const $push0=, 0
+; CHECK-NEXT:    i32.store8 0($pop1), $pop0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds i8, ptr %p, i32 %idx
   store i8 0, ptr %s
   ret void
 }
 
-; CHECK-LABEL: store_i8_with_array_alloca_gep:
-; CHECK: global.get  $push[[L0:[0-9]+]]=, __stack_pointer
-; CHECK: i32.const   $push[[L1:[0-9]+]]=, 32{{$}}
-; CHECK: i32.sub     $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; CHECK: local.copy  $push[[L3:[0-9]+]]=, $pop[[L2]]
-; CHECK: i32.add     $push[[L4:[0-9]+]]=, $pop[[L3]], $0{{$}}
-; CHECK: i32.const   $push[[L5:[0-9]+]]=, 0{{$}}
-; CHECK: i32.store8  0($pop[[L4]]), $pop[[L5]]{{$}}
 define hidden void @store_i8_with_array_alloca_gep(i32 %idx) {
+; CHECK-LABEL: store_i8_with_array_alloca_gep:
+; CHECK:         .functype store_i8_with_array_alloca_gep (i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push3=, __stack_pointer
+; CHECK-NEXT:    i32.const $push4=, 32
+; CHECK-NEXT:    i32.sub $push5=, $pop3, $pop4
+; CHECK-NEXT:    local.copy $push1=, $pop5
+; CHECK-NEXT:    i32.add $push2=, $pop1, $0
+; CHECK-NEXT:    i32.const $push0=, 0
+; CHECK-NEXT:    i32.store8 0($pop2), $pop0
+; CHECK-NEXT:    # fallthrough-return
   %A = alloca [30 x i8], align 16
   %s = getelementptr inbounds [30 x i8], ptr %A, i32 0, i32 %idx
   store i8 0, ptr %s, align 1
   ret void
 }
 
-; CHECK-LABEL: store_i32_with_unfolded_gep_offset:
-; CHECK: i32.const $push[[L0:[0-9]+]]=, 24{{$}}
-; CHECK: i32.add   $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; CHECK: i32.const $push[[L2:[0-9]+]]=, 0{{$}}
-; CHECK: i32.store 0($pop[[L1]]), $pop[[L2]]{{$}}
 define void @store_i32_with_unfolded_gep_offset(ptr %p) {
+; CHECK-LABEL: store_i32_with_unfolded_gep_offset:
+; CHECK:         .functype store_i32_with_unfolded_gep_offset (i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const $push1=, 24
+; CHECK-NEXT:    i32.add $push2=, $0, $pop1
+; CHECK-NEXT:    i32.const $push0=, 0
+; CHECK-NEXT:    i32.store 0($pop2), $pop0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr i32, ptr %p, i32 6
   store i32 0, ptr %s
   ret void
 }
 
-; CHECK-LABEL: store_i32_with_folded_gep_offset:
-; CHECK: i32.store 24($0), $pop{{[0-9]+$}}
 define void @store_i32_with_folded_gep_offset(ptr %p) {
+; CHECK-LABEL: store_i32_with_folded_gep_offset:
+; CHECK:         .functype store_i32_with_folded_gep_offset (i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const $push0=, 0
+; CHECK-NEXT:    i32.store 24($0), $pop0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds i32, ptr %p, i32 6
   store i32 0, ptr %s
   ret void
 }
 
-; CHECK-LABEL: load_i32_with_folded_gep_offset:
-; CHECK: i32.load  $push{{[0-9]+}}=, 24($0){{$}}
 define i32 @load_i32_with_folded_gep_offset(ptr %p) {
+; CHECK-LABEL: load_i32_with_folded_gep_offset:
+; CHECK:         .functype load_i32_with_folded_gep_offset (i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.load $push0=, 24($0)
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds i32, ptr %p, i32 6
   %t = load i32, ptr %s
   ret i32 %t
 }
 
-; CHECK-LABEL: store_i64_with_unfolded_gep_offset:
-; CHECK: i32.const $push[[L0:[0-9]+]]=, 24{{$}}
-; CHECK: i32.add   $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; CHECK: i64.const $push[[L2:[0-9]+]]=, 0{{$}}
-; CHECK: i64.store 0($pop[[L1]]), $pop[[L2]]{{$}}
 define void @store_i64_with_unfolded_gep_offset(ptr %p) {
+; CHECK-LABEL: store_i64_with_unfolded_gep_offset:
+; CHECK:         .functype store_i64_with_unfolded_gep_offset (i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const $push1=, 24
+; CHECK-NEXT:    i32.add $push2=, $0, $pop1
+; CHECK-NEXT:    i64.const $push0=, 0
+; CHECK-NEXT:    i64.store 0($pop2), $pop0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr i64, ptr %p, i32 3
   store i64 0, ptr %s
   ret void
 }
 
-; CHECK-LABEL: store_i8_with_folded_gep_offset:
-; CHECK: i32.store8 24($0), $pop{{[0-9]+$}}
 define void @store_i8_with_folded_gep_offset(ptr %p) {
+; CHECK-LABEL: store_i8_with_folded_gep_offset:
+; CHECK:         .functype store_i8_with_folded_gep_offset (i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const $push0=, 0
+; CHECK-NEXT:    i32.store8 24($0), $pop0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds i8, ptr %p, i32 24
   store i8 0, ptr %s
   ret void
 }
 
-; CHECK-LABEL: load_i8_u_with_folded_offset:
-; CHECK: i32.load8_u $push{{[0-9]+}}=, 24($0){{$}}
 define i32 @load_i8_u_with_folded_offset(ptr %p) {
+; CHECK-LABEL: load_i8_u_with_folded_offset:
+; CHECK:         .functype load_i8_u_with_folded_offset (i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.load8_u $push2=, 24($0)
+; CHECK-NEXT:    i32.const $push0=, 255
+; CHECK-NEXT:    i32.and $push1=, $pop2, $pop0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint ptr %p to i32
   %r = add nuw i32 %q, 24
   %s = inttoptr i32 %r to ptr
@@ -87,9 +116,16 @@ define i32 @load_i8_u_with_folded_offset(ptr %p) {
 }
 
 ; TODO: this should be load8_s, need to fold sign-/zero-extend in fast-isel
-; CHECK-LABEL: load_i8_s_with_folded_offset:
-; CHECK: i32.load8_u $push{{[0-9]+}}=, 24($0){{$}}
 define i32 @load_i8_s_with_folded_offset(ptr %p) {
+; CHECK-LABEL: load_i8_s_with_folded_offset:
+; CHECK:         .functype load_i8_s_with_folded_offset (i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.load8_u $push3=, 24($0)
+; CHECK-NEXT:    i32.const $push0=, 24
+; CHECK-NEXT:    i32.shl $push1=, $pop3, $pop0
+; CHECK-NEXT:    i32.const $push4=, 24
+; CHECK-NEXT:    i32.shr_s $push2=, $pop1, $pop4
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint ptr %p to i32
   %r = add nuw i32 %q, 24
   %s = inttoptr i32 %r to ptr

diff  --git a/llvm/test/CodeGen/WebAssembly/return-int32.ll b/llvm/test/CodeGen/WebAssembly/return-int32.ll
index 32fbc10d412f2..e6805629ec2dd 100644
--- a/llvm/test/CodeGen/WebAssembly/return-int32.ll
+++ b/llvm/test/CodeGen/WebAssembly/return-int32.ll
@@ -1,24 +1,66 @@
-; RUN: llc < %s -asm-verbose=false -wasm-keep-registers | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -wasm-keep-registers -fast-isel -fast-isel-abort=1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -wasm-keep-registers | FileCheck %s
+; RUN: llc < %s -wasm-keep-registers -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=FAST
 
 target triple = "wasm32-unknown-unknown"
 
-; CHECK-LABEL: return_i32:
-; CHECK-NEXT:  .functype return_i32 (i32) -> (i32){{$}}
-; CHECK-NEXT:  local.get  $push0=, 0
-; CHECK-NEXT:  end_function{{$}}
 define i32 @return_i32(i32 %p) {
+; CHECK-LABEL: return_i32:
+; CHECK:         .functype return_i32 (i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push0=, 0
+; CHECK-NEXT:    # fallthrough-return
+;
+; FAST-LABEL: return_i32:
+; FAST:         .functype return_i32 (i32) -> (i32)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    local.get $push0=, 0
+; FAST-NEXT:    # fallthrough-return
   ret i32 %p
 }
 
-; CHECK-LABEL: return_i32_twice:
-; CHECK:      store
-; CHECK-NEXT: i32.const $push[[L0:[^,]+]]=, 1{{$}}
-; CHECK-NEXT: return $pop[[L0]]{{$}}
-; CHECK:      store
-; CHECK-NEXT: i32.const $push{{[^,]+}}=, 3{{$}}
-; CHECK-NEXT: end_function{{$}}
 define i32 @return_i32_twice(i32 %a) {
+; CHECK-LABEL: return_i32_twice:
+; CHECK:         .functype return_i32_twice (i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    i32.eqz $push7=, $pop6
+; CHECK-NEXT:    br_if 0, $pop7 # 0: down to label0
+; CHECK-NEXT:  # %bb.1: # %true
+; CHECK-NEXT:    i32.const $push3=, 0
+; CHECK-NEXT:    i32.const $push5=, 0
+; CHECK-NEXT:    i32.store 0($pop3), $pop5
+; CHECK-NEXT:    i32.const $push4=, 1
+; CHECK-NEXT:    return $pop4
+; CHECK-NEXT:  .LBB1_2: # %false
+; CHECK-NEXT:    end_block # label0:
+; CHECK-NEXT:    i32.const $push1=, 0
+; CHECK-NEXT:    i32.const $push0=, 2
+; CHECK-NEXT:    i32.store 0($pop1), $pop0
+; CHECK-NEXT:    i32.const $push2=, 3
+; CHECK-NEXT:    # fallthrough-return
+;
+; FAST-LABEL: return_i32_twice:
+; FAST:         .functype return_i32_twice (i32) -> (i32)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    block
+; FAST-NEXT:    local.get $push6=, 0
+; FAST-NEXT:    i32.eqz $push7=, $pop6
+; FAST-NEXT:    br_if 0, $pop7 # 0: down to label0
+; FAST-NEXT:  # %bb.1: # %true
+; FAST-NEXT:    i32.const $push4=, 0
+; FAST-NEXT:    i32.const $push5=, 0
+; FAST-NEXT:    i32.store 0($pop4), $pop5
+; FAST-NEXT:    i32.const $push3=, 1
+; FAST-NEXT:    return $pop3
+; FAST-NEXT:  .LBB1_2: # %false
+; FAST-NEXT:    end_block # label0:
+; FAST-NEXT:    i32.const $push1=, 0
+; FAST-NEXT:    i32.const $push2=, 2
+; FAST-NEXT:    i32.store 0($pop1), $pop2
+; FAST-NEXT:    i32.const $push0=, 3
+; FAST-NEXT:    # fallthrough-return
   %b = icmp ne i32 %a, 0
   br i1 %b, label %true, label %false
 

diff  --git a/llvm/test/CodeGen/WebAssembly/return-void.ll b/llvm/test/CodeGen/WebAssembly/return-void.ll
index 7f2ef0916684a..bf109e3055934 100644
--- a/llvm/test/CodeGen/WebAssembly/return-void.ll
+++ b/llvm/test/CodeGen/WebAssembly/return-void.ll
@@ -1,20 +1,60 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -fast-isel -fast-isel-abort=1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+; RUN: llc < %s -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=FAST
 
 target triple = "wasm32-unknown-unknown"
 
-; CHECK-LABEL: return_void:
-; CHECK: end_function{{$}}
 define void @return_void() {
+; CHECK-LABEL: return_void:
+; CHECK:         .functype return_void () -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    # fallthrough-return
+;
+; FAST-LABEL: return_void:
+; FAST:         .functype return_void () -> ()
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    # fallthrough-return
   ret void
 }
 
-; CHECK-LABEL: return_void_twice:
-; CHECK:      store
-; CHECK-NEXT: return{{$}}
-; CHECK:      store
-; CHECK-NEXT: end_function{{$}}
 define void @return_void_twice(i32 %a) {
+; CHECK-LABEL: return_void_twice:
+; CHECK:         .functype return_void_twice (i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    block
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.eqz
+; CHECK-NEXT:    br_if 0 # 0: down to label0
+; CHECK-NEXT:  # %bb.1: # %true
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i32.store 0
+; CHECK-NEXT:    return
+; CHECK-NEXT:  .LBB1_2: # %false
+; CHECK-NEXT:    end_block # label0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.store 0
+; CHECK-NEXT:    # fallthrough-return
+;
+; FAST-LABEL: return_void_twice:
+; FAST:         .functype return_void_twice (i32) -> ()
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    block
+; FAST-NEXT:    local.get 0
+; FAST-NEXT:    i32.eqz
+; FAST-NEXT:    br_if 0 # 0: down to label0
+; FAST-NEXT:  # %bb.1: # %true
+; FAST-NEXT:    i32.const 0
+; FAST-NEXT:    i32.const 0
+; FAST-NEXT:    i32.store 0
+; FAST-NEXT:    return
+; FAST-NEXT:  .LBB1_2: # %false
+; FAST-NEXT:    end_block # label0:
+; FAST-NEXT:    i32.const 0
+; FAST-NEXT:    i32.const 1
+; FAST-NEXT:    i32.store 0
+; FAST-NEXT:    # fallthrough-return
   %b = icmp ne i32 %a, 0
   br i1 %b, label %true, label %false
 

diff  --git a/llvm/test/CodeGen/WebAssembly/returned.ll b/llvm/test/CodeGen/WebAssembly/returned.ll
index b00a670a015a8..e767e29704d54 100644
--- a/llvm/test/CodeGen/WebAssembly/returned.ll
+++ b/llvm/test/CodeGen/WebAssembly/returned.ll
@@ -1,31 +1,34 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
 
 ; Test that the "returned" attribute is optimized effectively.
 
 target triple = "wasm32-unknown-unknown"
 
-; CHECK-LABEL: _Z3foov:
-; CHECK-NEXT: .functype _Z3foov () -> (i32){{$}}
-; CHECK-NEXT: i32.const $push0=, 1{{$}}
-; CHECK-NEXT: {{^}} call      $push1=, _Znwm, $pop0{{$}}
-; CHECK-NEXT: {{^}} call      $push2=, _ZN5AppleC1Ev, $pop1{{$}}
-; CHECK-NEXT: return    $pop2{{$}}
 %class.Apple = type { i8 }
 declare noalias ptr @_Znwm(i32)
 declare ptr @_ZN5AppleC1Ev(ptr returned)
 define ptr @_Z3foov() {
+; CHECK-LABEL: _Z3foov:
+; CHECK:         .functype _Z3foov () -> (i32)
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    i32.const $push0=, 1
+; CHECK-NEXT:    call $push1=, _Znwm, $pop0
+; CHECK-NEXT:    call $push2=, _ZN5AppleC1Ev, $pop1
+; CHECK-NEXT:    return $pop2
 entry:
   %call = tail call noalias ptr @_Znwm(i32 1)
   %call1 = tail call ptr @_ZN5AppleC1Ev(ptr %call)
   ret ptr %call
 }
 
-; CHECK-LABEL: _Z3barPvS_l:
-; CHECK-NEXT: .functype _Z3barPvS_l (i32, i32, i32) -> (i32){{$}}
-; CHECK-NEXT: {{^}} call     $push0=, memcpy, $0, $1, $2{{$}}
-; CHECK-NEXT: return   $pop0{{$}}
 declare ptr @memcpy(ptr returned, ptr, i32)
 define ptr @_Z3barPvS_l(ptr %p, ptr %s, i32 %n) {
+; CHECK-LABEL: _Z3barPvS_l:
+; CHECK:         .functype _Z3barPvS_l (i32, i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    call $push0=, memcpy, $0, $1, $2
+; CHECK-NEXT:    return $pop0
 entry:
   %call = tail call ptr @memcpy(ptr %p, ptr %s, i32 %n)
   ret ptr %p
@@ -33,13 +36,15 @@ entry:
 
 ; Test that the optimization isn't performed on constant arguments.
 
-; CHECK-LABEL: test_constant_arg:
-; CHECK:      i32.const   $push0=, global{{$}}
-; CHECK-NEXT: {{^}} call        $drop=, returns_arg, $pop0{{$}}
-; CHECK-NEXT: return{{$}}
 @global = external global i32
 @addr = global ptr @global
 define void @test_constant_arg() {
+; CHECK-LABEL: test_constant_arg:
+; CHECK:         .functype test_constant_arg () -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const $push0=, global
+; CHECK-NEXT:    call $drop=, returns_arg, $pop0
+; CHECK-NEXT:    return
   %call = call ptr @returns_arg(ptr @global)
   ret void
 }
@@ -47,16 +52,17 @@ declare ptr @returns_arg(ptr returned)
 
 ; Test that the optimization isn't performed on arguments without the
 ; "returned" attribute.
-
-; CHECK-LABEL: test_other_skipped:
-; CHECK-NEXT: .functype test_other_skipped (i32, i32, f64) -> (){{$}}
-; CHECK-NEXT: {{^}} call     $drop=, do_something, $0, $1, $2{{$}}
-; CHECK-NEXT: {{^}} call     do_something_with_i32, $1{{$}}
-; CHECK-NEXT: {{^}} call     do_something_with_double, $2{{$}}
 declare i32 @do_something(i32 returned, i32, double)
 declare void @do_something_with_i32(i32)
 declare void @do_something_with_double(double)
 define void @test_other_skipped(i32 %a, i32 %b, double %c) {
+; CHECK-LABEL: test_other_skipped:
+; CHECK:         .functype test_other_skipped (i32, i32, f64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    call $drop=, do_something, $0, $1, $2
+; CHECK-NEXT:    call do_something_with_i32, $1
+; CHECK-NEXT:    call do_something_with_double, $2
+; CHECK-NEXT:    return
     %call = call i32 @do_something(i32 %a, i32 %b, double %c)
     call void @do_something_with_i32(i32 %b)
     call void @do_something_with_double(double %c)
@@ -64,13 +70,13 @@ define void @test_other_skipped(i32 %a, i32 %b, double %c) {
 }
 
 ; Test that the optimization is performed on arguments other than the first.
-
-; CHECK-LABEL: test_second_arg:
-; CHECK-NEXT: .functype test_second_arg (i32, i32) -> (i32){{$}}
-; CHECK-NEXT: {{^}} call     $push0=, do_something_else, $0, $1{{$}}
-; CHECK-NEXT: return   $pop0{{$}}
 declare i32 @do_something_else(i32, i32 returned)
 define i32 @test_second_arg(i32 %a, i32 %b) {
+; CHECK-LABEL: test_second_arg:
+; CHECK:         .functype test_second_arg (i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    call $push0=, do_something_else, $0, $1
+; CHECK-NEXT:    return $pop0
     %call = call i32 @do_something_else(i32 %a, i32 %b)
     ret i32 %b
 }

diff  --git a/llvm/test/CodeGen/WebAssembly/select.ll b/llvm/test/CodeGen/WebAssembly/select.ll
index 53ad565a6454d..93faf2e3dd3f7 100644
--- a/llvm/test/CodeGen/WebAssembly/select.ll
+++ b/llvm/test/CodeGen/WebAssembly/select.ll
@@ -1,218 +1,383 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s --check-prefixes CHECK,SLOW
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -fast-isel -fast-isel-abort=1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=FAST
 
 ; Test that wasm select instruction is selected from LLVM select instruction.
 
 target triple = "wasm32-unknown-unknown"
 
-; CHECK-LABEL: select_i32_bool:
-; CHECK-NEXT: .functype select_i32_bool (i32, i32, i32) -> (i32){{$}}
-; CHECK-NEXT: i32.select $push0=, $1, $2, $0{{$}}
-; CHECK-NEXT: return     $pop0{{$}}
 define i32 @select_i32_bool(i1 zeroext %a, i32 %b, i32 %c) {
+; CHECK-LABEL: select_i32_bool:
+; CHECK:         .functype select_i32_bool (i32, i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.select $push0=, $1, $2, $0
+; CHECK-NEXT:    return $pop0
+;
+; FAST-LABEL: select_i32_bool:
+; FAST:         .functype select_i32_bool (i32, i32, i32) -> (i32)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    i32.select $push0=, $1, $2, $0
+; FAST-NEXT:    return $pop0
   %cond = select i1 %a, i32 %b, i32 %c
   ret i32 %cond
 }
 
-; CHECK-LABEL: select_i32_bool_nozext:
-; CHECK-NEXT: .functype select_i32_bool_nozext (i32, i32, i32) -> (i32){{$}}
-; CHECK-NEXT: i32.const  $push0=, 1{{$}}
-; CHECK-NEXT: i32.and    $push1=, $0, $pop0{{$}}
-; CHECK-NEXT: i32.select $push2=, $1, $2, $pop1{{$}}
-; CHECK-NEXT: return     $pop2{{$}}
 define i32 @select_i32_bool_nozext(i1 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: select_i32_bool_nozext:
+; CHECK:         .functype select_i32_bool_nozext (i32, i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const $push0=, 1
+; CHECK-NEXT:    i32.and $push1=, $0, $pop0
+; CHECK-NEXT:    i32.select $push2=, $1, $2, $pop1
+; CHECK-NEXT:    return $pop2
+;
+; FAST-LABEL: select_i32_bool_nozext:
+; FAST:         .functype select_i32_bool_nozext (i32, i32, i32) -> (i32)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    i32.const $push0=, 1
+; FAST-NEXT:    i32.and $push1=, $0, $pop0
+; FAST-NEXT:    i32.select $push2=, $1, $2, $pop1
+; FAST-NEXT:    return $pop2
   %cond = select i1 %a, i32 %b, i32 %c
   ret i32 %cond
 }
 
-; CHECK-LABEL: select_i32_eq:
-; CHECK-NEXT: .functype select_i32_eq (i32, i32, i32) -> (i32){{$}}
-; CHECK-NEXT: i32.select $push0=, $2, $1, $0{{$}}
-; CHECK-NEXT: return     $pop0{{$}}
 define i32 @select_i32_eq(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: select_i32_eq:
+; CHECK:         .functype select_i32_eq (i32, i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.select $push0=, $2, $1, $0
+; CHECK-NEXT:    return $pop0
+;
+; FAST-LABEL: select_i32_eq:
+; FAST:         .functype select_i32_eq (i32, i32, i32) -> (i32)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    i32.select $push0=, $2, $1, $0
+; FAST-NEXT:    return $pop0
   %cmp = icmp eq i32 %a, 0
   %cond = select i1 %cmp, i32 %b, i32 %c
   ret i32 %cond
 }
 
-; CHECK-LABEL: select_i32_ne:
-; CHECK-NEXT: .functype select_i32_ne (i32, i32, i32) -> (i32){{$}}
-; CHECK-NEXT: i32.select $push0=, $1, $2, $0{{$}}
-; CHECK-NEXT: return     $pop0{{$}}
 define i32 @select_i32_ne(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: select_i32_ne:
+; CHECK:         .functype select_i32_ne (i32, i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.select $push0=, $1, $2, $0
+; CHECK-NEXT:    return $pop0
+;
+; FAST-LABEL: select_i32_ne:
+; FAST:         .functype select_i32_ne (i32, i32, i32) -> (i32)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    i32.select $push0=, $1, $2, $0
+; FAST-NEXT:    return $pop0
   %cmp = icmp ne i32 %a, 0
   %cond = select i1 %cmp, i32 %b, i32 %c
   ret i32 %cond
 }
 
-; CHECK-LABEL: select_i64_bool:
-; CHECK-NEXT: .functype select_i64_bool (i32, i64, i64) -> (i64){{$}}
-; CHECK-NEXT: i64.select $push0=, $1, $2, $0{{$}}
-; CHECK-NEXT: return     $pop0{{$}}
 define i64 @select_i64_bool(i1 zeroext %a, i64 %b, i64 %c) {
+; CHECK-LABEL: select_i64_bool:
+; CHECK:         .functype select_i64_bool (i32, i64, i64) -> (i64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i64.select $push0=, $1, $2, $0
+; CHECK-NEXT:    return $pop0
+;
+; FAST-LABEL: select_i64_bool:
+; FAST:         .functype select_i64_bool (i32, i64, i64) -> (i64)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    i64.select $push0=, $1, $2, $0
+; FAST-NEXT:    return $pop0
   %cond = select i1 %a, i64 %b, i64 %c
   ret i64 %cond
 }
 
-; CHECK-LABEL: select_i64_bool_nozext:
-; CHECK-NEXT: .functype select_i64_bool_nozext (i32, i64, i64) -> (i64){{$}}
-; CHECK-NEXT: i32.const  $push0=, 1{{$}}
-; CHECK-NEXT: i32.and    $push1=, $0, $pop0{{$}}
-; CHECK-NEXT: i64.select $push2=, $1, $2, $pop1{{$}}
-; CHECK-NEXT: return     $pop2{{$}}
 define i64 @select_i64_bool_nozext(i1 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: select_i64_bool_nozext:
+; CHECK:         .functype select_i64_bool_nozext (i32, i64, i64) -> (i64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const $push0=, 1
+; CHECK-NEXT:    i32.and $push1=, $0, $pop0
+; CHECK-NEXT:    i64.select $push2=, $1, $2, $pop1
+; CHECK-NEXT:    return $pop2
+;
+; FAST-LABEL: select_i64_bool_nozext:
+; FAST:         .functype select_i64_bool_nozext (i32, i64, i64) -> (i64)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    i32.const $push0=, 1
+; FAST-NEXT:    i32.and $push1=, $0, $pop0
+; FAST-NEXT:    i64.select $push2=, $1, $2, $pop1
+; FAST-NEXT:    return $pop2
   %cond = select i1 %a, i64 %b, i64 %c
   ret i64 %cond
 }
 
-; CHECK-LABEL: select_i64_eq:
-; CHECK-NEXT: .functype select_i64_eq (i32, i64, i64) -> (i64){{$}}
-; CHECK-NEXT: i64.select $push0=, $2, $1, $0{{$}}
-; CHECK-NEXT: return     $pop0{{$}}
 define i64 @select_i64_eq(i32 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: select_i64_eq:
+; CHECK:         .functype select_i64_eq (i32, i64, i64) -> (i64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i64.select $push0=, $2, $1, $0
+; CHECK-NEXT:    return $pop0
+;
+; FAST-LABEL: select_i64_eq:
+; FAST:         .functype select_i64_eq (i32, i64, i64) -> (i64)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    i64.select $push0=, $2, $1, $0
+; FAST-NEXT:    return $pop0
   %cmp = icmp eq i32 %a, 0
   %cond = select i1 %cmp, i64 %b, i64 %c
   ret i64 %cond
 }
 
-; CHECK-LABEL: select_i64_ne:
-; CHECK-NEXT: .functype select_i64_ne (i32, i64, i64) -> (i64){{$}}
-; CHECK-NEXT: i64.select $push0=, $1, $2, $0{{$}}
-; CHECK-NEXT: return     $pop0{{$}}
 define i64 @select_i64_ne(i32 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: select_i64_ne:
+; CHECK:         .functype select_i64_ne (i32, i64, i64) -> (i64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i64.select $push0=, $1, $2, $0
+; CHECK-NEXT:    return $pop0
+;
+; FAST-LABEL: select_i64_ne:
+; FAST:         .functype select_i64_ne (i32, i64, i64) -> (i64)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    i64.select $push0=, $1, $2, $0
+; FAST-NEXT:    return $pop0
   %cmp = icmp ne i32 %a, 0
   %cond = select i1 %cmp, i64 %b, i64 %c
   ret i64 %cond
 }
 
-; CHECK-LABEL: select_f32_bool:
-; CHECK-NEXT: .functype select_f32_bool (i32, f32, f32) -> (f32){{$}}
-; CHECK-NEXT: f32.select $push0=, $1, $2, $0{{$}}
-; CHECK-NEXT: return     $pop0{{$}}
 define float @select_f32_bool(i1 zeroext %a, float %b, float %c) {
+; CHECK-LABEL: select_f32_bool:
+; CHECK:         .functype select_f32_bool (i32, f32, f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    f32.select $push0=, $1, $2, $0
+; CHECK-NEXT:    return $pop0
+;
+; FAST-LABEL: select_f32_bool:
+; FAST:         .functype select_f32_bool (i32, f32, f32) -> (f32)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    f32.select $push0=, $1, $2, $0
+; FAST-NEXT:    return $pop0
   %cond = select i1 %a, float %b, float %c
   ret float %cond
 }
 
-; CHECK-LABEL: select_f32_bool_nozext:
-; CHECK-NEXT: .functype select_f32_bool_nozext (i32, f32, f32) -> (f32){{$}}
-; CHECK-NEXT: i32.const  $push0=, 1{{$}}
-; CHECK-NEXT: i32.and    $push1=, $0, $pop0{{$}}
-; CHECK-NEXT: f32.select $push2=, $1, $2, $pop1{{$}}
-; CHECK-NEXT: return     $pop2{{$}}
 define float @select_f32_bool_nozext(i1 %a, float %b, float %c) {
+; CHECK-LABEL: select_f32_bool_nozext:
+; CHECK:         .functype select_f32_bool_nozext (i32, f32, f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const $push0=, 1
+; CHECK-NEXT:    i32.and $push1=, $0, $pop0
+; CHECK-NEXT:    f32.select $push2=, $1, $2, $pop1
+; CHECK-NEXT:    return $pop2
+;
+; FAST-LABEL: select_f32_bool_nozext:
+; FAST:         .functype select_f32_bool_nozext (i32, f32, f32) -> (f32)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    i32.const $push0=, 1
+; FAST-NEXT:    i32.and $push1=, $0, $pop0
+; FAST-NEXT:    f32.select $push2=, $1, $2, $pop1
+; FAST-NEXT:    return $pop2
   %cond = select i1 %a, float %b, float %c
   ret float %cond
 }
 
-; CHECK-LABEL: select_f32_eq:
-; CHECK-NEXT: .functype select_f32_eq (i32, f32, f32) -> (f32){{$}}
-; CHECK-NEXT: f32.select $push0=, $2, $1, $0{{$}}
-; CHECK-NEXT: return     $pop0{{$}}
 define float @select_f32_eq(i32 %a, float %b, float %c) {
+; CHECK-LABEL: select_f32_eq:
+; CHECK:         .functype select_f32_eq (i32, f32, f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    f32.select $push0=, $2, $1, $0
+; CHECK-NEXT:    return $pop0
+;
+; FAST-LABEL: select_f32_eq:
+; FAST:         .functype select_f32_eq (i32, f32, f32) -> (f32)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    f32.select $push0=, $2, $1, $0
+; FAST-NEXT:    return $pop0
   %cmp = icmp eq i32 %a, 0
   %cond = select i1 %cmp, float %b, float %c
   ret float %cond
 }
 
-; CHECK-LABEL: select_f32_ne:
-; CHECK-NEXT: .functype select_f32_ne (i32, f32, f32) -> (f32){{$}}
-; CHECK-NEXT: f32.select $push0=, $1, $2, $0{{$}}
-; CHECK-NEXT: return     $pop0{{$}}
 define float @select_f32_ne(i32 %a, float %b, float %c) {
+; CHECK-LABEL: select_f32_ne:
+; CHECK:         .functype select_f32_ne (i32, f32, f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    f32.select $push0=, $1, $2, $0
+; CHECK-NEXT:    return $pop0
+;
+; FAST-LABEL: select_f32_ne:
+; FAST:         .functype select_f32_ne (i32, f32, f32) -> (f32)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    f32.select $push0=, $1, $2, $0
+; FAST-NEXT:    return $pop0
   %cmp = icmp ne i32 %a, 0
   %cond = select i1 %cmp, float %b, float %c
   ret float %cond
 }
 
-; CHECK-LABEL: select_f64_bool:
-; CHECK-NEXT: .functype select_f64_bool (i32, f64, f64) -> (f64){{$}}
-; CHECK-NEXT: f64.select $push0=, $1, $2, $0{{$}}
-; CHECK-NEXT: return     $pop0{{$}}
 define double @select_f64_bool(i1 zeroext %a, double %b, double %c) {
+; CHECK-LABEL: select_f64_bool:
+; CHECK:         .functype select_f64_bool (i32, f64, f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    f64.select $push0=, $1, $2, $0
+; CHECK-NEXT:    return $pop0
+;
+; FAST-LABEL: select_f64_bool:
+; FAST:         .functype select_f64_bool (i32, f64, f64) -> (f64)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    f64.select $push0=, $1, $2, $0
+; FAST-NEXT:    return $pop0
   %cond = select i1 %a, double %b, double %c
   ret double %cond
 }
 
-; CHECK-LABEL: select_f64_bool_nozext:
-; CHECK-NEXT: .functype select_f64_bool_nozext (i32, f64, f64) -> (f64){{$}}
-; CHECK-NEXT: i32.const  $push0=, 1{{$}}
-; CHECK-NEXT: i32.and    $push1=, $0, $pop0{{$}}
-; CHECK-NEXT: f64.select $push2=, $1, $2, $pop1{{$}}
-; CHECK-NEXT: return     $pop2{{$}}
 define double @select_f64_bool_nozext(i1 %a, double %b, double %c) {
+; CHECK-LABEL: select_f64_bool_nozext:
+; CHECK:         .functype select_f64_bool_nozext (i32, f64, f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const $push0=, 1
+; CHECK-NEXT:    i32.and $push1=, $0, $pop0
+; CHECK-NEXT:    f64.select $push2=, $1, $2, $pop1
+; CHECK-NEXT:    return $pop2
+;
+; FAST-LABEL: select_f64_bool_nozext:
+; FAST:         .functype select_f64_bool_nozext (i32, f64, f64) -> (f64)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    i32.const $push0=, 1
+; FAST-NEXT:    i32.and $push1=, $0, $pop0
+; FAST-NEXT:    f64.select $push2=, $1, $2, $pop1
+; FAST-NEXT:    return $pop2
   %cond = select i1 %a, double %b, double %c
   ret double %cond
 }
 
-; CHECK-LABEL: select_f64_eq:
-; CHECK-NEXT: .functype select_f64_eq (i32, f64, f64) -> (f64){{$}}
-; CHECK-NEXT: f64.select $push0=, $2, $1, $0{{$}}
-; CHECK-NEXT: return     $pop0{{$}}
 define double @select_f64_eq(i32 %a, double %b, double %c) {
+; CHECK-LABEL: select_f64_eq:
+; CHECK:         .functype select_f64_eq (i32, f64, f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    f64.select $push0=, $2, $1, $0
+; CHECK-NEXT:    return $pop0
+;
+; FAST-LABEL: select_f64_eq:
+; FAST:         .functype select_f64_eq (i32, f64, f64) -> (f64)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    f64.select $push0=, $2, $1, $0
+; FAST-NEXT:    return $pop0
   %cmp = icmp eq i32 %a, 0
   %cond = select i1 %cmp, double %b, double %c
   ret double %cond
 }
 
-; CHECK-LABEL: select_f64_ne:
-; CHECK-NEXT: .functype select_f64_ne (i32, f64, f64) -> (f64){{$}}
-; CHECK-NEXT: f64.select $push0=, $1, $2, $0{{$}}
-; CHECK-NEXT: return     $pop0{{$}}
 define double @select_f64_ne(i32 %a, double %b, double %c) {
+; CHECK-LABEL: select_f64_ne:
+; CHECK:         .functype select_f64_ne (i32, f64, f64) -> (f64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    f64.select $push0=, $1, $2, $0
+; CHECK-NEXT:    return $pop0
+;
+; FAST-LABEL: select_f64_ne:
+; FAST:         .functype select_f64_ne (i32, f64, f64) -> (f64)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    f64.select $push0=, $1, $2, $0
+; FAST-NEXT:    return $pop0
   %cmp = icmp ne i32 %a, 0
   %cond = select i1 %cmp, double %b, double %c
   ret double %cond
 }
 
-; CHECK-LABEL: pr40805_i32:
-; CHECK-NEXT: .functype pr40805_i32 (i32, i32, i32) -> (i32){{$}}
-; SLOW-NEXT: i32.const  $push0=, 1{{$}}
-; SLOW-NEXT: i32.and    $push1=, $0, $pop0{{$}}
-; SLOW-NEXT: i32.select $push2=, $1, $2, $pop1{{$}}
-; SLOW-NEXT: return     $pop2{{$}}
 define i32 @pr40805_i32(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: pr40805_i32:
+; CHECK:         .functype pr40805_i32 (i32, i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const $push0=, 1
+; CHECK-NEXT:    i32.and $push1=, $0, $pop0
+; CHECK-NEXT:    i32.select $push2=, $1, $2, $pop1
+; CHECK-NEXT:    return $pop2
+;
+; FAST-LABEL: pr40805_i32:
+; FAST:         .functype pr40805_i32 (i32, i32, i32) -> (i32)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    i32.const $push1=, 1
+; FAST-NEXT:    i32.and $push2=, $0, $pop1
+; FAST-NEXT:    i32.select $push0=, $1, $2, $pop2
+; FAST-NEXT:    return $pop0
   %a = and i32 %x, 1
   %b = icmp ne i32 %a, 0
   %c = select i1 %b, i32 %y, i32 %z
   ret i32 %c
 }
 
-; CHECK-LABEL: pr40805_i64:
-; CHECK-NEXT: .functype pr40805_i64 (i64, i64, i64) -> (i64){{$}}
-; SLOW-NEXT: i32.wrap_i64 $push0=, $0{{$}}
-; SLOW-NEXT: i32.const  $push1=, 1{{$}}
-; SLOW-NEXT: i32.and    $push2=, $pop0, $pop1{{$}}
-; SLOW-NEXT: i64.select $push3=, $1, $2, $pop2{{$}}
-; SLOW-NEXT: return     $pop3{{$}}
 define i64 @pr40805_i64(i64 %x, i64 %y, i64 %z) {
+; CHECK-LABEL: pr40805_i64:
+; CHECK:         .functype pr40805_i64 (i64, i64, i64) -> (i64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.wrap_i64 $push0=, $0
+; CHECK-NEXT:    i32.const $push1=, 1
+; CHECK-NEXT:    i32.and $push2=, $pop0, $pop1
+; CHECK-NEXT:    i64.select $push3=, $1, $2, $pop2
+; CHECK-NEXT:    return $pop3
+;
+; FAST-LABEL: pr40805_i64:
+; FAST:         .functype pr40805_i64 (i64, i64, i64) -> (i64)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    i64.const $push5=, 1
+; FAST-NEXT:    i64.and $push6=, $0, $pop5
+; FAST-NEXT:    i64.const $push3=, 0
+; FAST-NEXT:    i64.ne $push4=, $pop6, $pop3
+; FAST-NEXT:    i32.const $push0=, 1
+; FAST-NEXT:    i32.and $push1=, $pop4, $pop0
+; FAST-NEXT:    i64.select $push2=, $1, $2, $pop1
+; FAST-NEXT:    return $pop2
   %a = and i64 %x, 1
   %b = icmp ne i64 %a, 0
   %c = select i1 %b, i64 %y, i64 %z
   ret i64 %c
 }
 
-; CHECK-LABEL: pr44012_i32:
-; CHECK-NEXT: .functype pr44012_i32 (i32, f32, f32) -> (f32){{$}}
-; SLOW-NEXT: i32.const  $push0=, 1{{$}}
-; SLOW-NEXT: i32.and    $push1=, $0, $pop0{{$}}
-; SLOW-NEXT: f32.select $push2=, $1, $2, $pop1{{$}}
-; SLOW-NEXT: return     $pop2{{$}}
 define float @pr44012_i32(i32 %x, float %y, float %z) {
+; CHECK-LABEL: pr44012_i32:
+; CHECK:         .functype pr44012_i32 (i32, f32, f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const $push0=, 1
+; CHECK-NEXT:    i32.and $push1=, $0, $pop0
+; CHECK-NEXT:    f32.select $push2=, $1, $2, $pop1
+; CHECK-NEXT:    return $pop2
+;
+; FAST-LABEL: pr44012_i32:
+; FAST:         .functype pr44012_i32 (i32, f32, f32) -> (f32)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    i32.const $push1=, 1
+; FAST-NEXT:    i32.and $push2=, $0, $pop1
+; FAST-NEXT:    f32.select $push0=, $1, $2, $pop2
+; FAST-NEXT:    return $pop0
   %a = and i32 %x, 1
   %b = icmp ne i32 %a, 0
   %c = select i1 %b, float %y, float %z
   ret float %c
 }
 
-; CHECK-LABEL: pr44012_i64:
-; CHECK-NEXT: .functype pr44012_i64 (i64, f32, f32) -> (f32){{$}}
-; SLOW-NEXT: i32.wrap_i64 $push0=, $0{{$}}
-; SLOW-NEXT: i32.const  $push1=, 1{{$}}
-; SLOW-NEXT: i32.and    $push2=, $pop0, $pop1{{$}}
-; SLOW-NEXT: f32.select $push3=, $1, $2, $pop2{{$}}
-; SLOW-NEXT: return     $pop3{{$}}
 define float @pr44012_i64(i64 %x, float %y, float %z) {
+; CHECK-LABEL: pr44012_i64:
+; CHECK:         .functype pr44012_i64 (i64, f32, f32) -> (f32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.wrap_i64 $push0=, $0
+; CHECK-NEXT:    i32.const $push1=, 1
+; CHECK-NEXT:    i32.and $push2=, $pop0, $pop1
+; CHECK-NEXT:    f32.select $push3=, $1, $2, $pop2
+; CHECK-NEXT:    return $pop3
+;
+; FAST-LABEL: pr44012_i64:
+; FAST:         .functype pr44012_i64 (i64, f32, f32) -> (f32)
+; FAST-NEXT:  # %bb.0:
+; FAST-NEXT:    i64.const $push5=, 1
+; FAST-NEXT:    i64.and $push6=, $0, $pop5
+; FAST-NEXT:    i64.const $push3=, 0
+; FAST-NEXT:    i64.ne $push4=, $pop6, $pop3
+; FAST-NEXT:    i32.const $push0=, 1
+; FAST-NEXT:    i32.and $push1=, $pop4, $pop0
+; FAST-NEXT:    f32.select $push2=, $1, $2, $pop1
+; FAST-NEXT:    return $pop2
   %a = and i64 %x, 1
   %b = icmp ne i64 %a, 0
   %c = select i1 %b, float %y, float %z

diff  --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
index 78fdccc6e60bb..013482f4ac5f8 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
@@ -1,10 +1,8 @@
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefixes CHECK,SIMD128,SIMD128-SLOW
-
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 -fast-isel | FileCheck %s --check-prefixes CHECK,SIMD128,SIMD128-FAST
-
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s --check-prefixes CHECK,NO-SIMD128
-
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -fast-isel | FileCheck %s --check-prefixes CHECK,NO-SIMD128
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefix=SIMD128
+; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 -fast-isel | FileCheck %s --check-prefix=SIMD128-FAST
+; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s --check-prefix=NO-SIMD128
+; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -fast-isel | FileCheck %s --check-prefix=NO-SIMD128-FAST
 
 ; check that a non-test run (including explicit locals pass) at least finishes
 ; RUN: llc < %s -O0 -mattr=+simd128
@@ -17,87 +15,1908 @@ target triple = "wasm32-unknown-unknown"
 ; ==============================================================================
 ; 16 x i8
 ; ==============================================================================
-; CHECK-LABEL: add_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype add_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.add $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @add_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SIMD128-LABEL: add_v16i8:
+; SIMD128:         .functype add_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.add $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: add_v16i8:
+; SIMD128-FAST:         .functype add_v16i8 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.add $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: add_v16i8:
+; NO-SIMD128:         .functype add_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.add $push0=, $9, $25
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.add $push1=, $5, $21
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.add $push2=, $3, $19
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop2
+; NO-SIMD128-NEXT:    i32.add $push3=, $2, $18
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop3
+; NO-SIMD128-NEXT:    i32.add $push4=, $1, $17
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push6=, 15
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.add $push5=, $16, $32
+; NO-SIMD128-NEXT:    i32.store8 0($pop7), $pop5
+; NO-SIMD128-NEXT:    i32.const $push9=, 14
+; NO-SIMD128-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT:    i32.add $push8=, $15, $31
+; NO-SIMD128-NEXT:    i32.store8 0($pop10), $pop8
+; NO-SIMD128-NEXT:    i32.const $push12=, 13
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.add $push11=, $14, $30
+; NO-SIMD128-NEXT:    i32.store8 0($pop13), $pop11
+; NO-SIMD128-NEXT:    i32.const $push15=, 12
+; NO-SIMD128-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-NEXT:    i32.add $push14=, $13, $29
+; NO-SIMD128-NEXT:    i32.store8 0($pop16), $pop14
+; NO-SIMD128-NEXT:    i32.const $push18=, 11
+; NO-SIMD128-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-NEXT:    i32.add $push17=, $12, $28
+; NO-SIMD128-NEXT:    i32.store8 0($pop19), $pop17
+; NO-SIMD128-NEXT:    i32.const $push21=, 10
+; NO-SIMD128-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT:    i32.add $push20=, $11, $27
+; NO-SIMD128-NEXT:    i32.store8 0($pop22), $pop20
+; NO-SIMD128-NEXT:    i32.const $push24=, 9
+; NO-SIMD128-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-NEXT:    i32.add $push23=, $10, $26
+; NO-SIMD128-NEXT:    i32.store8 0($pop25), $pop23
+; NO-SIMD128-NEXT:    i32.const $push27=, 7
+; NO-SIMD128-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-NEXT:    i32.add $push26=, $8, $24
+; NO-SIMD128-NEXT:    i32.store8 0($pop28), $pop26
+; NO-SIMD128-NEXT:    i32.const $push30=, 6
+; NO-SIMD128-NEXT:    i32.add $push31=, $0, $pop30
+; NO-SIMD128-NEXT:    i32.add $push29=, $7, $23
+; NO-SIMD128-NEXT:    i32.store8 0($pop31), $pop29
+; NO-SIMD128-NEXT:    i32.const $push33=, 5
+; NO-SIMD128-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-NEXT:    i32.add $push32=, $6, $22
+; NO-SIMD128-NEXT:    i32.store8 0($pop34), $pop32
+; NO-SIMD128-NEXT:    i32.const $push36=, 3
+; NO-SIMD128-NEXT:    i32.add $push37=, $0, $pop36
+; NO-SIMD128-NEXT:    i32.add $push35=, $4, $20
+; NO-SIMD128-NEXT:    i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: add_v16i8:
+; NO-SIMD128-FAST:         .functype add_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.add $push0=, $1, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.add $push1=, $2, $18
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.add $push2=, $3, $19
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $4, $20
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.add $push6=, $5, $21
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $6, $22
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop8), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push10=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.add $push12=, $7, $23
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop11), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.add $push15=, $8, $24
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT:    i32.add $push16=, $9, $25
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.add $push19=, $10, $26
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop18), $pop19
+; NO-SIMD128-FAST-NEXT:    i32.const $push20=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.add $push22=, $11, $27
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop21), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push23=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.add $push25=, $12, $28
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop24), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $13, $29
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop27), $pop28
+; NO-SIMD128-FAST-NEXT:    i32.const $push29=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.add $push31=, $14, $30
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop30), $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push32=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push33=, $0, $pop32
+; NO-SIMD128-FAST-NEXT:    i32.add $push34=, $15, $31
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop33), $pop34
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push36=, $0, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.add $push37=, $16, $32
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT:    return
   %a = add <16 x i8> %x, %y
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: sub_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype sub_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.sub $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @sub_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SIMD128-LABEL: sub_v16i8:
+; SIMD128:         .functype sub_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.sub $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: sub_v16i8:
+; SIMD128-FAST:         .functype sub_v16i8 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.sub $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: sub_v16i8:
+; NO-SIMD128:         .functype sub_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.sub $push0=, $9, $25
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.sub $push1=, $5, $21
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.sub $push2=, $3, $19
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop2
+; NO-SIMD128-NEXT:    i32.sub $push3=, $2, $18
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop3
+; NO-SIMD128-NEXT:    i32.sub $push4=, $1, $17
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push6=, 15
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.sub $push5=, $16, $32
+; NO-SIMD128-NEXT:    i32.store8 0($pop7), $pop5
+; NO-SIMD128-NEXT:    i32.const $push9=, 14
+; NO-SIMD128-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT:    i32.sub $push8=, $15, $31
+; NO-SIMD128-NEXT:    i32.store8 0($pop10), $pop8
+; NO-SIMD128-NEXT:    i32.const $push12=, 13
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.sub $push11=, $14, $30
+; NO-SIMD128-NEXT:    i32.store8 0($pop13), $pop11
+; NO-SIMD128-NEXT:    i32.const $push15=, 12
+; NO-SIMD128-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-NEXT:    i32.sub $push14=, $13, $29
+; NO-SIMD128-NEXT:    i32.store8 0($pop16), $pop14
+; NO-SIMD128-NEXT:    i32.const $push18=, 11
+; NO-SIMD128-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-NEXT:    i32.sub $push17=, $12, $28
+; NO-SIMD128-NEXT:    i32.store8 0($pop19), $pop17
+; NO-SIMD128-NEXT:    i32.const $push21=, 10
+; NO-SIMD128-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT:    i32.sub $push20=, $11, $27
+; NO-SIMD128-NEXT:    i32.store8 0($pop22), $pop20
+; NO-SIMD128-NEXT:    i32.const $push24=, 9
+; NO-SIMD128-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-NEXT:    i32.sub $push23=, $10, $26
+; NO-SIMD128-NEXT:    i32.store8 0($pop25), $pop23
+; NO-SIMD128-NEXT:    i32.const $push27=, 7
+; NO-SIMD128-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-NEXT:    i32.sub $push26=, $8, $24
+; NO-SIMD128-NEXT:    i32.store8 0($pop28), $pop26
+; NO-SIMD128-NEXT:    i32.const $push30=, 6
+; NO-SIMD128-NEXT:    i32.add $push31=, $0, $pop30
+; NO-SIMD128-NEXT:    i32.sub $push29=, $7, $23
+; NO-SIMD128-NEXT:    i32.store8 0($pop31), $pop29
+; NO-SIMD128-NEXT:    i32.const $push33=, 5
+; NO-SIMD128-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-NEXT:    i32.sub $push32=, $6, $22
+; NO-SIMD128-NEXT:    i32.store8 0($pop34), $pop32
+; NO-SIMD128-NEXT:    i32.const $push36=, 3
+; NO-SIMD128-NEXT:    i32.add $push37=, $0, $pop36
+; NO-SIMD128-NEXT:    i32.sub $push35=, $4, $20
+; NO-SIMD128-NEXT:    i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: sub_v16i8:
+; NO-SIMD128-FAST:         .functype sub_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.sub $push0=, $1, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push1=, $2, $18
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.sub $push2=, $3, $19
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.sub $push5=, $4, $20
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.sub $push6=, $5, $21
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.sub $push9=, $6, $22
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop8), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push10=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.sub $push12=, $7, $23
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop11), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.sub $push15=, $8, $24
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT:    i32.sub $push16=, $9, $25
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.sub $push19=, $10, $26
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop18), $pop19
+; NO-SIMD128-FAST-NEXT:    i32.const $push20=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.sub $push22=, $11, $27
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop21), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push23=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.sub $push25=, $12, $28
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop24), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.sub $push28=, $13, $29
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop27), $pop28
+; NO-SIMD128-FAST-NEXT:    i32.const $push29=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.sub $push31=, $14, $30
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop30), $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push32=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push33=, $0, $pop32
+; NO-SIMD128-FAST-NEXT:    i32.sub $push34=, $15, $31
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop33), $pop34
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push36=, $0, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.sub $push37=, $16, $32
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT:    return
   %a = sub <16 x i8> %x, %y
   ret <16 x i8> %a
 }
 
-; i8x16.mul is not in spec
-; CHECK-LABEL: mul_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NOT: i8x16.mul
-; SIMD128: i8x16.extract_lane_u
-; SIMD128: i32.mul
 define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SIMD128-LABEL: mul_v16i8:
+; SIMD128:         .functype mul_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.extract_lane_u $push4=, $0, 0
+; SIMD128-NEXT:    i8x16.extract_lane_u $push3=, $1, 0
+; SIMD128-NEXT:    i32.mul $push5=, $pop4, $pop3
+; SIMD128-NEXT:    i8x16.splat $push6=, $pop5
+; SIMD128-NEXT:    i8x16.extract_lane_u $push1=, $0, 1
+; SIMD128-NEXT:    i8x16.extract_lane_u $push0=, $1, 1
+; SIMD128-NEXT:    i32.mul $push2=, $pop1, $pop0
+; SIMD128-NEXT:    i8x16.replace_lane $push7=, $pop6, 1, $pop2
+; SIMD128-NEXT:    i8x16.extract_lane_u $push9=, $0, 2
+; SIMD128-NEXT:    i8x16.extract_lane_u $push8=, $1, 2
+; SIMD128-NEXT:    i32.mul $push10=, $pop9, $pop8
+; SIMD128-NEXT:    i8x16.replace_lane $push11=, $pop7, 2, $pop10
+; SIMD128-NEXT:    i8x16.extract_lane_u $push13=, $0, 3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push12=, $1, 3
+; SIMD128-NEXT:    i32.mul $push14=, $pop13, $pop12
+; SIMD128-NEXT:    i8x16.replace_lane $push15=, $pop11, 3, $pop14
+; SIMD128-NEXT:    i8x16.extract_lane_u $push17=, $0, 4
+; SIMD128-NEXT:    i8x16.extract_lane_u $push16=, $1, 4
+; SIMD128-NEXT:    i32.mul $push18=, $pop17, $pop16
+; SIMD128-NEXT:    i8x16.replace_lane $push19=, $pop15, 4, $pop18
+; SIMD128-NEXT:    i8x16.extract_lane_u $push21=, $0, 5
+; SIMD128-NEXT:    i8x16.extract_lane_u $push20=, $1, 5
+; SIMD128-NEXT:    i32.mul $push22=, $pop21, $pop20
+; SIMD128-NEXT:    i8x16.replace_lane $push23=, $pop19, 5, $pop22
+; SIMD128-NEXT:    i8x16.extract_lane_u $push25=, $0, 6
+; SIMD128-NEXT:    i8x16.extract_lane_u $push24=, $1, 6
+; SIMD128-NEXT:    i32.mul $push26=, $pop25, $pop24
+; SIMD128-NEXT:    i8x16.replace_lane $push27=, $pop23, 6, $pop26
+; SIMD128-NEXT:    i8x16.extract_lane_u $push29=, $0, 7
+; SIMD128-NEXT:    i8x16.extract_lane_u $push28=, $1, 7
+; SIMD128-NEXT:    i32.mul $push30=, $pop29, $pop28
+; SIMD128-NEXT:    i8x16.replace_lane $push31=, $pop27, 7, $pop30
+; SIMD128-NEXT:    i8x16.extract_lane_u $push33=, $0, 8
+; SIMD128-NEXT:    i8x16.extract_lane_u $push32=, $1, 8
+; SIMD128-NEXT:    i32.mul $push34=, $pop33, $pop32
+; SIMD128-NEXT:    i8x16.replace_lane $push35=, $pop31, 8, $pop34
+; SIMD128-NEXT:    i8x16.extract_lane_u $push37=, $0, 9
+; SIMD128-NEXT:    i8x16.extract_lane_u $push36=, $1, 9
+; SIMD128-NEXT:    i32.mul $push38=, $pop37, $pop36
+; SIMD128-NEXT:    i8x16.replace_lane $push39=, $pop35, 9, $pop38
+; SIMD128-NEXT:    i8x16.extract_lane_u $push41=, $0, 10
+; SIMD128-NEXT:    i8x16.extract_lane_u $push40=, $1, 10
+; SIMD128-NEXT:    i32.mul $push42=, $pop41, $pop40
+; SIMD128-NEXT:    i8x16.replace_lane $push43=, $pop39, 10, $pop42
+; SIMD128-NEXT:    i8x16.extract_lane_u $push45=, $0, 11
+; SIMD128-NEXT:    i8x16.extract_lane_u $push44=, $1, 11
+; SIMD128-NEXT:    i32.mul $push46=, $pop45, $pop44
+; SIMD128-NEXT:    i8x16.replace_lane $push47=, $pop43, 11, $pop46
+; SIMD128-NEXT:    i8x16.extract_lane_u $push49=, $0, 12
+; SIMD128-NEXT:    i8x16.extract_lane_u $push48=, $1, 12
+; SIMD128-NEXT:    i32.mul $push50=, $pop49, $pop48
+; SIMD128-NEXT:    i8x16.replace_lane $push51=, $pop47, 12, $pop50
+; SIMD128-NEXT:    i8x16.extract_lane_u $push53=, $0, 13
+; SIMD128-NEXT:    i8x16.extract_lane_u $push52=, $1, 13
+; SIMD128-NEXT:    i32.mul $push54=, $pop53, $pop52
+; SIMD128-NEXT:    i8x16.replace_lane $push55=, $pop51, 13, $pop54
+; SIMD128-NEXT:    i8x16.extract_lane_u $push57=, $0, 14
+; SIMD128-NEXT:    i8x16.extract_lane_u $push56=, $1, 14
+; SIMD128-NEXT:    i32.mul $push58=, $pop57, $pop56
+; SIMD128-NEXT:    i8x16.replace_lane $push59=, $pop55, 14, $pop58
+; SIMD128-NEXT:    i8x16.extract_lane_u $push61=, $0, 15
+; SIMD128-NEXT:    i8x16.extract_lane_u $push60=, $1, 15
+; SIMD128-NEXT:    i32.mul $push62=, $pop61, $pop60
+; SIMD128-NEXT:    i8x16.replace_lane $push63=, $pop59, 15, $pop62
+; SIMD128-NEXT:    return $pop63
+;
+; SIMD128-FAST-LABEL: mul_v16i8:
+; SIMD128-FAST:         .functype mul_v16i8 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push5=, $0, 0
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push4=, $1, 0
+; SIMD128-FAST-NEXT:    i32.mul $push6=, $pop5, $pop4
+; SIMD128-FAST-NEXT:    i8x16.splat $push7=, $pop6
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push2=, $0, 1
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push1=, $1, 1
+; SIMD128-FAST-NEXT:    i32.mul $push3=, $pop2, $pop1
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push8=, $pop7, 1, $pop3
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push10=, $0, 2
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push9=, $1, 2
+; SIMD128-FAST-NEXT:    i32.mul $push11=, $pop10, $pop9
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push12=, $pop8, 2, $pop11
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push14=, $0, 3
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push13=, $1, 3
+; SIMD128-FAST-NEXT:    i32.mul $push15=, $pop14, $pop13
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push16=, $pop12, 3, $pop15
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push18=, $0, 4
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push17=, $1, 4
+; SIMD128-FAST-NEXT:    i32.mul $push19=, $pop18, $pop17
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push20=, $pop16, 4, $pop19
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push22=, $0, 5
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push21=, $1, 5
+; SIMD128-FAST-NEXT:    i32.mul $push23=, $pop22, $pop21
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push24=, $pop20, 5, $pop23
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push26=, $0, 6
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push25=, $1, 6
+; SIMD128-FAST-NEXT:    i32.mul $push27=, $pop26, $pop25
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push28=, $pop24, 6, $pop27
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push30=, $0, 7
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push29=, $1, 7
+; SIMD128-FAST-NEXT:    i32.mul $push31=, $pop30, $pop29
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push32=, $pop28, 7, $pop31
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push34=, $0, 8
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push33=, $1, 8
+; SIMD128-FAST-NEXT:    i32.mul $push35=, $pop34, $pop33
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push36=, $pop32, 8, $pop35
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push38=, $0, 9
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push37=, $1, 9
+; SIMD128-FAST-NEXT:    i32.mul $push39=, $pop38, $pop37
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push40=, $pop36, 9, $pop39
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push42=, $0, 10
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push41=, $1, 10
+; SIMD128-FAST-NEXT:    i32.mul $push43=, $pop42, $pop41
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push44=, $pop40, 10, $pop43
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push46=, $0, 11
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push45=, $1, 11
+; SIMD128-FAST-NEXT:    i32.mul $push47=, $pop46, $pop45
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push48=, $pop44, 11, $pop47
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push50=, $0, 12
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push49=, $1, 12
+; SIMD128-FAST-NEXT:    i32.mul $push51=, $pop50, $pop49
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push52=, $pop48, 12, $pop51
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push54=, $0, 13
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push53=, $1, 13
+; SIMD128-FAST-NEXT:    i32.mul $push55=, $pop54, $pop53
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push56=, $pop52, 13, $pop55
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push58=, $0, 14
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push57=, $1, 14
+; SIMD128-FAST-NEXT:    i32.mul $push59=, $pop58, $pop57
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push60=, $pop56, 14, $pop59
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push62=, $0, 15
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push61=, $1, 15
+; SIMD128-FAST-NEXT:    i32.mul $push63=, $pop62, $pop61
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push0=, $pop60, 15, $pop63
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: mul_v16i8:
+; NO-SIMD128:         .functype mul_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.mul $push0=, $9, $25
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.mul $push1=, $5, $21
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.mul $push2=, $3, $19
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop2
+; NO-SIMD128-NEXT:    i32.mul $push3=, $2, $18
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop3
+; NO-SIMD128-NEXT:    i32.mul $push4=, $1, $17
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push6=, 15
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.mul $push5=, $16, $32
+; NO-SIMD128-NEXT:    i32.store8 0($pop7), $pop5
+; NO-SIMD128-NEXT:    i32.const $push9=, 14
+; NO-SIMD128-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT:    i32.mul $push8=, $15, $31
+; NO-SIMD128-NEXT:    i32.store8 0($pop10), $pop8
+; NO-SIMD128-NEXT:    i32.const $push12=, 13
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.mul $push11=, $14, $30
+; NO-SIMD128-NEXT:    i32.store8 0($pop13), $pop11
+; NO-SIMD128-NEXT:    i32.const $push15=, 12
+; NO-SIMD128-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-NEXT:    i32.mul $push14=, $13, $29
+; NO-SIMD128-NEXT:    i32.store8 0($pop16), $pop14
+; NO-SIMD128-NEXT:    i32.const $push18=, 11
+; NO-SIMD128-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-NEXT:    i32.mul $push17=, $12, $28
+; NO-SIMD128-NEXT:    i32.store8 0($pop19), $pop17
+; NO-SIMD128-NEXT:    i32.const $push21=, 10
+; NO-SIMD128-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT:    i32.mul $push20=, $11, $27
+; NO-SIMD128-NEXT:    i32.store8 0($pop22), $pop20
+; NO-SIMD128-NEXT:    i32.const $push24=, 9
+; NO-SIMD128-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-NEXT:    i32.mul $push23=, $10, $26
+; NO-SIMD128-NEXT:    i32.store8 0($pop25), $pop23
+; NO-SIMD128-NEXT:    i32.const $push27=, 7
+; NO-SIMD128-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-NEXT:    i32.mul $push26=, $8, $24
+; NO-SIMD128-NEXT:    i32.store8 0($pop28), $pop26
+; NO-SIMD128-NEXT:    i32.const $push30=, 6
+; NO-SIMD128-NEXT:    i32.add $push31=, $0, $pop30
+; NO-SIMD128-NEXT:    i32.mul $push29=, $7, $23
+; NO-SIMD128-NEXT:    i32.store8 0($pop31), $pop29
+; NO-SIMD128-NEXT:    i32.const $push33=, 5
+; NO-SIMD128-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-NEXT:    i32.mul $push32=, $6, $22
+; NO-SIMD128-NEXT:    i32.store8 0($pop34), $pop32
+; NO-SIMD128-NEXT:    i32.const $push36=, 3
+; NO-SIMD128-NEXT:    i32.add $push37=, $0, $pop36
+; NO-SIMD128-NEXT:    i32.mul $push35=, $4, $20
+; NO-SIMD128-NEXT:    i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: mul_v16i8:
+; NO-SIMD128-FAST:         .functype mul_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.mul $push0=, $1, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.mul $push1=, $2, $18
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.mul $push2=, $3, $19
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.mul $push5=, $4, $20
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.mul $push6=, $5, $21
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.mul $push9=, $6, $22
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop8), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push10=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.mul $push12=, $7, $23
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop11), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.mul $push15=, $8, $24
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT:    i32.mul $push16=, $9, $25
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.mul $push19=, $10, $26
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop18), $pop19
+; NO-SIMD128-FAST-NEXT:    i32.const $push20=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.mul $push22=, $11, $27
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop21), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push23=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.mul $push25=, $12, $28
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop24), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.mul $push28=, $13, $29
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop27), $pop28
+; NO-SIMD128-FAST-NEXT:    i32.const $push29=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.mul $push31=, $14, $30
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop30), $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push32=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push33=, $0, $pop32
+; NO-SIMD128-FAST-NEXT:    i32.mul $push34=, $15, $31
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop33), $pop34
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push36=, $0, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.mul $push37=, $16, $32
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT:    return
   %a = mul <16 x i8> %x, %y
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: min_s_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype min_s_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.min_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @min_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SIMD128-LABEL: min_s_v16i8:
+; SIMD128:         .functype min_s_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.min_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: min_s_v16i8:
+; SIMD128-FAST:         .functype min_s_v16i8 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.min_s $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: min_s_v16i8:
+; NO-SIMD128:         .functype min_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push4=, 15
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.extend8_s $push1=, $16
+; NO-SIMD128-NEXT:    i32.extend8_s $push0=, $32
+; NO-SIMD128-NEXT:    i32.lt_s $push2=, $pop1, $pop0
+; NO-SIMD128-NEXT:    i32.select $push3=, $16, $32, $pop2
+; NO-SIMD128-NEXT:    i32.store8 0($pop5), $pop3
+; NO-SIMD128-NEXT:    i32.const $push10=, 14
+; NO-SIMD128-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-NEXT:    i32.extend8_s $push7=, $15
+; NO-SIMD128-NEXT:    i32.extend8_s $push6=, $31
+; NO-SIMD128-NEXT:    i32.lt_s $push8=, $pop7, $pop6
+; NO-SIMD128-NEXT:    i32.select $push9=, $15, $31, $pop8
+; NO-SIMD128-NEXT:    i32.store8 0($pop11), $pop9
+; NO-SIMD128-NEXT:    i32.const $push16=, 13
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.extend8_s $push13=, $14
+; NO-SIMD128-NEXT:    i32.extend8_s $push12=, $30
+; NO-SIMD128-NEXT:    i32.lt_s $push14=, $pop13, $pop12
+; NO-SIMD128-NEXT:    i32.select $push15=, $14, $30, $pop14
+; NO-SIMD128-NEXT:    i32.store8 0($pop17), $pop15
+; NO-SIMD128-NEXT:    i32.const $push22=, 12
+; NO-SIMD128-NEXT:    i32.add $push23=, $0, $pop22
+; NO-SIMD128-NEXT:    i32.extend8_s $push19=, $13
+; NO-SIMD128-NEXT:    i32.extend8_s $push18=, $29
+; NO-SIMD128-NEXT:    i32.lt_s $push20=, $pop19, $pop18
+; NO-SIMD128-NEXT:    i32.select $push21=, $13, $29, $pop20
+; NO-SIMD128-NEXT:    i32.store8 0($pop23), $pop21
+; NO-SIMD128-NEXT:    i32.const $push28=, 11
+; NO-SIMD128-NEXT:    i32.add $push29=, $0, $pop28
+; NO-SIMD128-NEXT:    i32.extend8_s $push25=, $12
+; NO-SIMD128-NEXT:    i32.extend8_s $push24=, $28
+; NO-SIMD128-NEXT:    i32.lt_s $push26=, $pop25, $pop24
+; NO-SIMD128-NEXT:    i32.select $push27=, $12, $28, $pop26
+; NO-SIMD128-NEXT:    i32.store8 0($pop29), $pop27
+; NO-SIMD128-NEXT:    i32.const $push34=, 10
+; NO-SIMD128-NEXT:    i32.add $push35=, $0, $pop34
+; NO-SIMD128-NEXT:    i32.extend8_s $push31=, $11
+; NO-SIMD128-NEXT:    i32.extend8_s $push30=, $27
+; NO-SIMD128-NEXT:    i32.lt_s $push32=, $pop31, $pop30
+; NO-SIMD128-NEXT:    i32.select $push33=, $11, $27, $pop32
+; NO-SIMD128-NEXT:    i32.store8 0($pop35), $pop33
+; NO-SIMD128-NEXT:    i32.const $push40=, 9
+; NO-SIMD128-NEXT:    i32.add $push41=, $0, $pop40
+; NO-SIMD128-NEXT:    i32.extend8_s $push37=, $10
+; NO-SIMD128-NEXT:    i32.extend8_s $push36=, $26
+; NO-SIMD128-NEXT:    i32.lt_s $push38=, $pop37, $pop36
+; NO-SIMD128-NEXT:    i32.select $push39=, $10, $26, $pop38
+; NO-SIMD128-NEXT:    i32.store8 0($pop41), $pop39
+; NO-SIMD128-NEXT:    i32.extend8_s $push43=, $9
+; NO-SIMD128-NEXT:    i32.extend8_s $push42=, $25
+; NO-SIMD128-NEXT:    i32.lt_s $push44=, $pop43, $pop42
+; NO-SIMD128-NEXT:    i32.select $push45=, $9, $25, $pop44
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop45
+; NO-SIMD128-NEXT:    i32.const $push50=, 7
+; NO-SIMD128-NEXT:    i32.add $push51=, $0, $pop50
+; NO-SIMD128-NEXT:    i32.extend8_s $push47=, $8
+; NO-SIMD128-NEXT:    i32.extend8_s $push46=, $24
+; NO-SIMD128-NEXT:    i32.lt_s $push48=, $pop47, $pop46
+; NO-SIMD128-NEXT:    i32.select $push49=, $8, $24, $pop48
+; NO-SIMD128-NEXT:    i32.store8 0($pop51), $pop49
+; NO-SIMD128-NEXT:    i32.const $push56=, 6
+; NO-SIMD128-NEXT:    i32.add $push57=, $0, $pop56
+; NO-SIMD128-NEXT:    i32.extend8_s $push53=, $7
+; NO-SIMD128-NEXT:    i32.extend8_s $push52=, $23
+; NO-SIMD128-NEXT:    i32.lt_s $push54=, $pop53, $pop52
+; NO-SIMD128-NEXT:    i32.select $push55=, $7, $23, $pop54
+; NO-SIMD128-NEXT:    i32.store8 0($pop57), $pop55
+; NO-SIMD128-NEXT:    i32.const $push62=, 5
+; NO-SIMD128-NEXT:    i32.add $push63=, $0, $pop62
+; NO-SIMD128-NEXT:    i32.extend8_s $push59=, $6
+; NO-SIMD128-NEXT:    i32.extend8_s $push58=, $22
+; NO-SIMD128-NEXT:    i32.lt_s $push60=, $pop59, $pop58
+; NO-SIMD128-NEXT:    i32.select $push61=, $6, $22, $pop60
+; NO-SIMD128-NEXT:    i32.store8 0($pop63), $pop61
+; NO-SIMD128-NEXT:    i32.extend8_s $push65=, $5
+; NO-SIMD128-NEXT:    i32.extend8_s $push64=, $21
+; NO-SIMD128-NEXT:    i32.lt_s $push66=, $pop65, $pop64
+; NO-SIMD128-NEXT:    i32.select $push67=, $5, $21, $pop66
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop67
+; NO-SIMD128-NEXT:    i32.const $push72=, 3
+; NO-SIMD128-NEXT:    i32.add $push73=, $0, $pop72
+; NO-SIMD128-NEXT:    i32.extend8_s $push69=, $4
+; NO-SIMD128-NEXT:    i32.extend8_s $push68=, $20
+; NO-SIMD128-NEXT:    i32.lt_s $push70=, $pop69, $pop68
+; NO-SIMD128-NEXT:    i32.select $push71=, $4, $20, $pop70
+; NO-SIMD128-NEXT:    i32.store8 0($pop73), $pop71
+; NO-SIMD128-NEXT:    i32.extend8_s $push75=, $3
+; NO-SIMD128-NEXT:    i32.extend8_s $push74=, $19
+; NO-SIMD128-NEXT:    i32.lt_s $push76=, $pop75, $pop74
+; NO-SIMD128-NEXT:    i32.select $push77=, $3, $19, $pop76
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop77
+; NO-SIMD128-NEXT:    i32.extend8_s $push79=, $2
+; NO-SIMD128-NEXT:    i32.extend8_s $push78=, $18
+; NO-SIMD128-NEXT:    i32.lt_s $push80=, $pop79, $pop78
+; NO-SIMD128-NEXT:    i32.select $push81=, $2, $18, $pop80
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop81
+; NO-SIMD128-NEXT:    i32.extend8_s $push83=, $1
+; NO-SIMD128-NEXT:    i32.extend8_s $push82=, $17
+; NO-SIMD128-NEXT:    i32.lt_s $push84=, $pop83, $pop82
+; NO-SIMD128-NEXT:    i32.select $push85=, $1, $17, $pop84
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop85
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: min_s_v16i8:
+; NO-SIMD128-FAST:         .functype min_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push1=, $1
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push0=, $17
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push2=, $pop1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.select $push3=, $1, $17, $pop2
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push5=, $2
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push4=, $18
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.select $push7=, $2, $18, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop7
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push9=, $3
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push8=, $19
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push10=, $pop9, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.select $push11=, $3, $19, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop11
+; NO-SIMD128-FAST-NEXT:    i32.const $push16=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push13=, $4
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push12=, $20
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push14=, $pop13, $pop12
+; NO-SIMD128-FAST-NEXT:    i32.select $push15=, $4, $20, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop17), $pop15
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push19=, $5
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push18=, $21
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push20=, $pop19, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.select $push21=, $5, $21, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push23=, $6
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push22=, $22
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT:    i32.select $push25=, $6, $22, $pop24
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop27), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push32=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push33=, $0, $pop32
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push29=, $7
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push28=, $23
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push30=, $pop29, $pop28
+; NO-SIMD128-FAST-NEXT:    i32.select $push31=, $7, $23, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop33), $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push38=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push39=, $0, $pop38
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push35=, $8
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push34=, $24
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push36=, $pop35, $pop34
+; NO-SIMD128-FAST-NEXT:    i32.select $push37=, $8, $24, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop39), $pop37
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push41=, $9
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push40=, $25
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push42=, $pop41, $pop40
+; NO-SIMD128-FAST-NEXT:    i32.select $push43=, $9, $25, $pop42
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop43
+; NO-SIMD128-FAST-NEXT:    i32.const $push48=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push49=, $0, $pop48
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push45=, $10
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push44=, $26
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push46=, $pop45, $pop44
+; NO-SIMD128-FAST-NEXT:    i32.select $push47=, $10, $26, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop49), $pop47
+; NO-SIMD128-FAST-NEXT:    i32.const $push54=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push55=, $0, $pop54
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push51=, $11
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push50=, $27
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push52=, $pop51, $pop50
+; NO-SIMD128-FAST-NEXT:    i32.select $push53=, $11, $27, $pop52
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop55), $pop53
+; NO-SIMD128-FAST-NEXT:    i32.const $push60=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push61=, $0, $pop60
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push57=, $12
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push56=, $28
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push58=, $pop57, $pop56
+; NO-SIMD128-FAST-NEXT:    i32.select $push59=, $12, $28, $pop58
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop61), $pop59
+; NO-SIMD128-FAST-NEXT:    i32.const $push66=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push67=, $0, $pop66
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push63=, $13
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push62=, $29
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push64=, $pop63, $pop62
+; NO-SIMD128-FAST-NEXT:    i32.select $push65=, $13, $29, $pop64
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop67), $pop65
+; NO-SIMD128-FAST-NEXT:    i32.const $push72=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push73=, $0, $pop72
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push69=, $14
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push68=, $30
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push70=, $pop69, $pop68
+; NO-SIMD128-FAST-NEXT:    i32.select $push71=, $14, $30, $pop70
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop73), $pop71
+; NO-SIMD128-FAST-NEXT:    i32.const $push78=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push79=, $0, $pop78
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push75=, $15
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push74=, $31
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push76=, $pop75, $pop74
+; NO-SIMD128-FAST-NEXT:    i32.select $push77=, $15, $31, $pop76
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop79), $pop77
+; NO-SIMD128-FAST-NEXT:    i32.const $push84=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push85=, $0, $pop84
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push81=, $16
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push80=, $32
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push82=, $pop81, $pop80
+; NO-SIMD128-FAST-NEXT:    i32.select $push83=, $16, $32, $pop82
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop85), $pop83
+; NO-SIMD128-FAST-NEXT:    return
   %c = icmp slt <16 x i8> %x, %y
   %a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: min_u_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype min_u_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.min_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @min_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SIMD128-LABEL: min_u_v16i8:
+; SIMD128:         .functype min_u_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.min_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: min_u_v16i8:
+; SIMD128-FAST:         .functype min_u_v16i8 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.min_u $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: min_u_v16i8:
+; NO-SIMD128:         .functype min_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push5=, 15
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-NEXT:    i32.and $push2=, $16, $pop0
+; NO-SIMD128-NEXT:    i32.const $push117=, 255
+; NO-SIMD128-NEXT:    i32.and $push1=, $32, $pop117
+; NO-SIMD128-NEXT:    i32.lt_u $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.select $push4=, $16, $32, $pop3
+; NO-SIMD128-NEXT:    i32.store8 0($pop6), $pop4
+; NO-SIMD128-NEXT:    i32.const $push11=, 14
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.const $push116=, 255
+; NO-SIMD128-NEXT:    i32.and $push8=, $15, $pop116
+; NO-SIMD128-NEXT:    i32.const $push115=, 255
+; NO-SIMD128-NEXT:    i32.and $push7=, $31, $pop115
+; NO-SIMD128-NEXT:    i32.lt_u $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.select $push10=, $15, $31, $pop9
+; NO-SIMD128-NEXT:    i32.store8 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push17=, 13
+; NO-SIMD128-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-NEXT:    i32.const $push114=, 255
+; NO-SIMD128-NEXT:    i32.and $push14=, $14, $pop114
+; NO-SIMD128-NEXT:    i32.const $push113=, 255
+; NO-SIMD128-NEXT:    i32.and $push13=, $30, $pop113
+; NO-SIMD128-NEXT:    i32.lt_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT:    i32.select $push16=, $14, $30, $pop15
+; NO-SIMD128-NEXT:    i32.store8 0($pop18), $pop16
+; NO-SIMD128-NEXT:    i32.const $push23=, 12
+; NO-SIMD128-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-NEXT:    i32.const $push112=, 255
+; NO-SIMD128-NEXT:    i32.and $push20=, $13, $pop112
+; NO-SIMD128-NEXT:    i32.const $push111=, 255
+; NO-SIMD128-NEXT:    i32.and $push19=, $29, $pop111
+; NO-SIMD128-NEXT:    i32.lt_u $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT:    i32.select $push22=, $13, $29, $pop21
+; NO-SIMD128-NEXT:    i32.store8 0($pop24), $pop22
+; NO-SIMD128-NEXT:    i32.const $push29=, 11
+; NO-SIMD128-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-NEXT:    i32.const $push110=, 255
+; NO-SIMD128-NEXT:    i32.and $push26=, $12, $pop110
+; NO-SIMD128-NEXT:    i32.const $push109=, 255
+; NO-SIMD128-NEXT:    i32.and $push25=, $28, $pop109
+; NO-SIMD128-NEXT:    i32.lt_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT:    i32.select $push28=, $12, $28, $pop27
+; NO-SIMD128-NEXT:    i32.store8 0($pop30), $pop28
+; NO-SIMD128-NEXT:    i32.const $push35=, 10
+; NO-SIMD128-NEXT:    i32.add $push36=, $0, $pop35
+; NO-SIMD128-NEXT:    i32.const $push108=, 255
+; NO-SIMD128-NEXT:    i32.and $push32=, $11, $pop108
+; NO-SIMD128-NEXT:    i32.const $push107=, 255
+; NO-SIMD128-NEXT:    i32.and $push31=, $27, $pop107
+; NO-SIMD128-NEXT:    i32.lt_u $push33=, $pop32, $pop31
+; NO-SIMD128-NEXT:    i32.select $push34=, $11, $27, $pop33
+; NO-SIMD128-NEXT:    i32.store8 0($pop36), $pop34
+; NO-SIMD128-NEXT:    i32.const $push41=, 9
+; NO-SIMD128-NEXT:    i32.add $push42=, $0, $pop41
+; NO-SIMD128-NEXT:    i32.const $push106=, 255
+; NO-SIMD128-NEXT:    i32.and $push38=, $10, $pop106
+; NO-SIMD128-NEXT:    i32.const $push105=, 255
+; NO-SIMD128-NEXT:    i32.and $push37=, $26, $pop105
+; NO-SIMD128-NEXT:    i32.lt_u $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT:    i32.select $push40=, $10, $26, $pop39
+; NO-SIMD128-NEXT:    i32.store8 0($pop42), $pop40
+; NO-SIMD128-NEXT:    i32.const $push104=, 255
+; NO-SIMD128-NEXT:    i32.and $push44=, $9, $pop104
+; NO-SIMD128-NEXT:    i32.const $push103=, 255
+; NO-SIMD128-NEXT:    i32.and $push43=, $25, $pop103
+; NO-SIMD128-NEXT:    i32.lt_u $push45=, $pop44, $pop43
+; NO-SIMD128-NEXT:    i32.select $push46=, $9, $25, $pop45
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop46
+; NO-SIMD128-NEXT:    i32.const $push51=, 7
+; NO-SIMD128-NEXT:    i32.add $push52=, $0, $pop51
+; NO-SIMD128-NEXT:    i32.const $push102=, 255
+; NO-SIMD128-NEXT:    i32.and $push48=, $8, $pop102
+; NO-SIMD128-NEXT:    i32.const $push101=, 255
+; NO-SIMD128-NEXT:    i32.and $push47=, $24, $pop101
+; NO-SIMD128-NEXT:    i32.lt_u $push49=, $pop48, $pop47
+; NO-SIMD128-NEXT:    i32.select $push50=, $8, $24, $pop49
+; NO-SIMD128-NEXT:    i32.store8 0($pop52), $pop50
+; NO-SIMD128-NEXT:    i32.const $push57=, 6
+; NO-SIMD128-NEXT:    i32.add $push58=, $0, $pop57
+; NO-SIMD128-NEXT:    i32.const $push100=, 255
+; NO-SIMD128-NEXT:    i32.and $push54=, $7, $pop100
+; NO-SIMD128-NEXT:    i32.const $push99=, 255
+; NO-SIMD128-NEXT:    i32.and $push53=, $23, $pop99
+; NO-SIMD128-NEXT:    i32.lt_u $push55=, $pop54, $pop53
+; NO-SIMD128-NEXT:    i32.select $push56=, $7, $23, $pop55
+; NO-SIMD128-NEXT:    i32.store8 0($pop58), $pop56
+; NO-SIMD128-NEXT:    i32.const $push63=, 5
+; NO-SIMD128-NEXT:    i32.add $push64=, $0, $pop63
+; NO-SIMD128-NEXT:    i32.const $push98=, 255
+; NO-SIMD128-NEXT:    i32.and $push60=, $6, $pop98
+; NO-SIMD128-NEXT:    i32.const $push97=, 255
+; NO-SIMD128-NEXT:    i32.and $push59=, $22, $pop97
+; NO-SIMD128-NEXT:    i32.lt_u $push61=, $pop60, $pop59
+; NO-SIMD128-NEXT:    i32.select $push62=, $6, $22, $pop61
+; NO-SIMD128-NEXT:    i32.store8 0($pop64), $pop62
+; NO-SIMD128-NEXT:    i32.const $push96=, 255
+; NO-SIMD128-NEXT:    i32.and $push66=, $5, $pop96
+; NO-SIMD128-NEXT:    i32.const $push95=, 255
+; NO-SIMD128-NEXT:    i32.and $push65=, $21, $pop95
+; NO-SIMD128-NEXT:    i32.lt_u $push67=, $pop66, $pop65
+; NO-SIMD128-NEXT:    i32.select $push68=, $5, $21, $pop67
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop68
+; NO-SIMD128-NEXT:    i32.const $push73=, 3
+; NO-SIMD128-NEXT:    i32.add $push74=, $0, $pop73
+; NO-SIMD128-NEXT:    i32.const $push94=, 255
+; NO-SIMD128-NEXT:    i32.and $push70=, $4, $pop94
+; NO-SIMD128-NEXT:    i32.const $push93=, 255
+; NO-SIMD128-NEXT:    i32.and $push69=, $20, $pop93
+; NO-SIMD128-NEXT:    i32.lt_u $push71=, $pop70, $pop69
+; NO-SIMD128-NEXT:    i32.select $push72=, $4, $20, $pop71
+; NO-SIMD128-NEXT:    i32.store8 0($pop74), $pop72
+; NO-SIMD128-NEXT:    i32.const $push92=, 255
+; NO-SIMD128-NEXT:    i32.and $push76=, $3, $pop92
+; NO-SIMD128-NEXT:    i32.const $push91=, 255
+; NO-SIMD128-NEXT:    i32.and $push75=, $19, $pop91
+; NO-SIMD128-NEXT:    i32.lt_u $push77=, $pop76, $pop75
+; NO-SIMD128-NEXT:    i32.select $push78=, $3, $19, $pop77
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop78
+; NO-SIMD128-NEXT:    i32.const $push90=, 255
+; NO-SIMD128-NEXT:    i32.and $push80=, $2, $pop90
+; NO-SIMD128-NEXT:    i32.const $push89=, 255
+; NO-SIMD128-NEXT:    i32.and $push79=, $18, $pop89
+; NO-SIMD128-NEXT:    i32.lt_u $push81=, $pop80, $pop79
+; NO-SIMD128-NEXT:    i32.select $push82=, $2, $18, $pop81
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop82
+; NO-SIMD128-NEXT:    i32.const $push88=, 255
+; NO-SIMD128-NEXT:    i32.and $push84=, $1, $pop88
+; NO-SIMD128-NEXT:    i32.const $push87=, 255
+; NO-SIMD128-NEXT:    i32.and $push83=, $17, $pop87
+; NO-SIMD128-NEXT:    i32.lt_u $push85=, $pop84, $pop83
+; NO-SIMD128-NEXT:    i32.select $push86=, $1, $17, $pop85
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop86
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: min_u_v16i8:
+; NO-SIMD128-FAST:         .functype min_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.const $push117=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $17, $pop117
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.select $push4=, $1, $17, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push116=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push6=, $2, $pop116
+; NO-SIMD128-FAST-NEXT:    i32.const $push115=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $18, $pop115
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push7=, $pop6, $pop5
+; NO-SIMD128-FAST-NEXT:    i32.select $push8=, $2, $18, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push114=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $3, $pop114
+; NO-SIMD128-FAST-NEXT:    i32.const $push113=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $19, $pop113
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.select $push12=, $3, $19, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push112=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push14=, $4, $pop112
+; NO-SIMD128-FAST-NEXT:    i32.const $push111=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $20, $pop111
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.select $push16=, $4, $20, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop18), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push110=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push20=, $5, $pop110
+; NO-SIMD128-FAST-NEXT:    i32.const $push109=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $21, $pop109
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.select $push22=, $5, $21, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.const $push108=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push24=, $6, $pop108
+; NO-SIMD128-FAST-NEXT:    i32.const $push107=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $22, $pop107
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push25=, $pop24, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.select $push26=, $6, $22, $pop25
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop28), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.const $push106=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push30=, $7, $pop106
+; NO-SIMD128-FAST-NEXT:    i32.const $push105=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $23, $pop105
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push31=, $pop30, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.select $push32=, $7, $23, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop34), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push40=, $0, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.const $push104=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push36=, $8, $pop104
+; NO-SIMD128-FAST-NEXT:    i32.const $push103=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push35=, $24, $pop103
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push37=, $pop36, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.select $push38=, $8, $24, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT:    i32.const $push102=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push42=, $9, $pop102
+; NO-SIMD128-FAST-NEXT:    i32.const $push101=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push41=, $25, $pop101
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push43=, $pop42, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.select $push44=, $9, $25, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop44
+; NO-SIMD128-FAST-NEXT:    i32.const $push49=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push50=, $0, $pop49
+; NO-SIMD128-FAST-NEXT:    i32.const $push100=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push46=, $10, $pop100
+; NO-SIMD128-FAST-NEXT:    i32.const $push99=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push45=, $26, $pop99
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push47=, $pop46, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.select $push48=, $10, $26, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop50), $pop48
+; NO-SIMD128-FAST-NEXT:    i32.const $push55=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push56=, $0, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.const $push98=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push52=, $11, $pop98
+; NO-SIMD128-FAST-NEXT:    i32.const $push97=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push51=, $27, $pop97
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push53=, $pop52, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.select $push54=, $11, $27, $pop53
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop56), $pop54
+; NO-SIMD128-FAST-NEXT:    i32.const $push61=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push62=, $0, $pop61
+; NO-SIMD128-FAST-NEXT:    i32.const $push96=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push58=, $12, $pop96
+; NO-SIMD128-FAST-NEXT:    i32.const $push95=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push57=, $28, $pop95
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push59=, $pop58, $pop57
+; NO-SIMD128-FAST-NEXT:    i32.select $push60=, $12, $28, $pop59
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop62), $pop60
+; NO-SIMD128-FAST-NEXT:    i32.const $push67=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push68=, $0, $pop67
+; NO-SIMD128-FAST-NEXT:    i32.const $push94=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push64=, $13, $pop94
+; NO-SIMD128-FAST-NEXT:    i32.const $push93=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push63=, $29, $pop93
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push65=, $pop64, $pop63
+; NO-SIMD128-FAST-NEXT:    i32.select $push66=, $13, $29, $pop65
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop68), $pop66
+; NO-SIMD128-FAST-NEXT:    i32.const $push73=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push74=, $0, $pop73
+; NO-SIMD128-FAST-NEXT:    i32.const $push92=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push70=, $14, $pop92
+; NO-SIMD128-FAST-NEXT:    i32.const $push91=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push69=, $30, $pop91
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push71=, $pop70, $pop69
+; NO-SIMD128-FAST-NEXT:    i32.select $push72=, $14, $30, $pop71
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop74), $pop72
+; NO-SIMD128-FAST-NEXT:    i32.const $push79=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push80=, $0, $pop79
+; NO-SIMD128-FAST-NEXT:    i32.const $push90=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push76=, $15, $pop90
+; NO-SIMD128-FAST-NEXT:    i32.const $push89=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push75=, $31, $pop89
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push77=, $pop76, $pop75
+; NO-SIMD128-FAST-NEXT:    i32.select $push78=, $15, $31, $pop77
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop80), $pop78
+; NO-SIMD128-FAST-NEXT:    i32.const $push85=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push86=, $0, $pop85
+; NO-SIMD128-FAST-NEXT:    i32.const $push88=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push82=, $16, $pop88
+; NO-SIMD128-FAST-NEXT:    i32.const $push87=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push81=, $32, $pop87
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push83=, $pop82, $pop81
+; NO-SIMD128-FAST-NEXT:    i32.select $push84=, $16, $32, $pop83
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop86), $pop84
+; NO-SIMD128-FAST-NEXT:    return
   %c = icmp ult <16 x i8> %x, %y
   %a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: max_s_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype max_s_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.max_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @max_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SIMD128-LABEL: max_s_v16i8:
+; SIMD128:         .functype max_s_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.max_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: max_s_v16i8:
+; SIMD128-FAST:         .functype max_s_v16i8 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.max_s $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: max_s_v16i8:
+; NO-SIMD128:         .functype max_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push4=, 15
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.extend8_s $push1=, $16
+; NO-SIMD128-NEXT:    i32.extend8_s $push0=, $32
+; NO-SIMD128-NEXT:    i32.gt_s $push2=, $pop1, $pop0
+; NO-SIMD128-NEXT:    i32.select $push3=, $16, $32, $pop2
+; NO-SIMD128-NEXT:    i32.store8 0($pop5), $pop3
+; NO-SIMD128-NEXT:    i32.const $push10=, 14
+; NO-SIMD128-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-NEXT:    i32.extend8_s $push7=, $15
+; NO-SIMD128-NEXT:    i32.extend8_s $push6=, $31
+; NO-SIMD128-NEXT:    i32.gt_s $push8=, $pop7, $pop6
+; NO-SIMD128-NEXT:    i32.select $push9=, $15, $31, $pop8
+; NO-SIMD128-NEXT:    i32.store8 0($pop11), $pop9
+; NO-SIMD128-NEXT:    i32.const $push16=, 13
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.extend8_s $push13=, $14
+; NO-SIMD128-NEXT:    i32.extend8_s $push12=, $30
+; NO-SIMD128-NEXT:    i32.gt_s $push14=, $pop13, $pop12
+; NO-SIMD128-NEXT:    i32.select $push15=, $14, $30, $pop14
+; NO-SIMD128-NEXT:    i32.store8 0($pop17), $pop15
+; NO-SIMD128-NEXT:    i32.const $push22=, 12
+; NO-SIMD128-NEXT:    i32.add $push23=, $0, $pop22
+; NO-SIMD128-NEXT:    i32.extend8_s $push19=, $13
+; NO-SIMD128-NEXT:    i32.extend8_s $push18=, $29
+; NO-SIMD128-NEXT:    i32.gt_s $push20=, $pop19, $pop18
+; NO-SIMD128-NEXT:    i32.select $push21=, $13, $29, $pop20
+; NO-SIMD128-NEXT:    i32.store8 0($pop23), $pop21
+; NO-SIMD128-NEXT:    i32.const $push28=, 11
+; NO-SIMD128-NEXT:    i32.add $push29=, $0, $pop28
+; NO-SIMD128-NEXT:    i32.extend8_s $push25=, $12
+; NO-SIMD128-NEXT:    i32.extend8_s $push24=, $28
+; NO-SIMD128-NEXT:    i32.gt_s $push26=, $pop25, $pop24
+; NO-SIMD128-NEXT:    i32.select $push27=, $12, $28, $pop26
+; NO-SIMD128-NEXT:    i32.store8 0($pop29), $pop27
+; NO-SIMD128-NEXT:    i32.const $push34=, 10
+; NO-SIMD128-NEXT:    i32.add $push35=, $0, $pop34
+; NO-SIMD128-NEXT:    i32.extend8_s $push31=, $11
+; NO-SIMD128-NEXT:    i32.extend8_s $push30=, $27
+; NO-SIMD128-NEXT:    i32.gt_s $push32=, $pop31, $pop30
+; NO-SIMD128-NEXT:    i32.select $push33=, $11, $27, $pop32
+; NO-SIMD128-NEXT:    i32.store8 0($pop35), $pop33
+; NO-SIMD128-NEXT:    i32.const $push40=, 9
+; NO-SIMD128-NEXT:    i32.add $push41=, $0, $pop40
+; NO-SIMD128-NEXT:    i32.extend8_s $push37=, $10
+; NO-SIMD128-NEXT:    i32.extend8_s $push36=, $26
+; NO-SIMD128-NEXT:    i32.gt_s $push38=, $pop37, $pop36
+; NO-SIMD128-NEXT:    i32.select $push39=, $10, $26, $pop38
+; NO-SIMD128-NEXT:    i32.store8 0($pop41), $pop39
+; NO-SIMD128-NEXT:    i32.extend8_s $push43=, $9
+; NO-SIMD128-NEXT:    i32.extend8_s $push42=, $25
+; NO-SIMD128-NEXT:    i32.gt_s $push44=, $pop43, $pop42
+; NO-SIMD128-NEXT:    i32.select $push45=, $9, $25, $pop44
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop45
+; NO-SIMD128-NEXT:    i32.const $push50=, 7
+; NO-SIMD128-NEXT:    i32.add $push51=, $0, $pop50
+; NO-SIMD128-NEXT:    i32.extend8_s $push47=, $8
+; NO-SIMD128-NEXT:    i32.extend8_s $push46=, $24
+; NO-SIMD128-NEXT:    i32.gt_s $push48=, $pop47, $pop46
+; NO-SIMD128-NEXT:    i32.select $push49=, $8, $24, $pop48
+; NO-SIMD128-NEXT:    i32.store8 0($pop51), $pop49
+; NO-SIMD128-NEXT:    i32.const $push56=, 6
+; NO-SIMD128-NEXT:    i32.add $push57=, $0, $pop56
+; NO-SIMD128-NEXT:    i32.extend8_s $push53=, $7
+; NO-SIMD128-NEXT:    i32.extend8_s $push52=, $23
+; NO-SIMD128-NEXT:    i32.gt_s $push54=, $pop53, $pop52
+; NO-SIMD128-NEXT:    i32.select $push55=, $7, $23, $pop54
+; NO-SIMD128-NEXT:    i32.store8 0($pop57), $pop55
+; NO-SIMD128-NEXT:    i32.const $push62=, 5
+; NO-SIMD128-NEXT:    i32.add $push63=, $0, $pop62
+; NO-SIMD128-NEXT:    i32.extend8_s $push59=, $6
+; NO-SIMD128-NEXT:    i32.extend8_s $push58=, $22
+; NO-SIMD128-NEXT:    i32.gt_s $push60=, $pop59, $pop58
+; NO-SIMD128-NEXT:    i32.select $push61=, $6, $22, $pop60
+; NO-SIMD128-NEXT:    i32.store8 0($pop63), $pop61
+; NO-SIMD128-NEXT:    i32.extend8_s $push65=, $5
+; NO-SIMD128-NEXT:    i32.extend8_s $push64=, $21
+; NO-SIMD128-NEXT:    i32.gt_s $push66=, $pop65, $pop64
+; NO-SIMD128-NEXT:    i32.select $push67=, $5, $21, $pop66
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop67
+; NO-SIMD128-NEXT:    i32.const $push72=, 3
+; NO-SIMD128-NEXT:    i32.add $push73=, $0, $pop72
+; NO-SIMD128-NEXT:    i32.extend8_s $push69=, $4
+; NO-SIMD128-NEXT:    i32.extend8_s $push68=, $20
+; NO-SIMD128-NEXT:    i32.gt_s $push70=, $pop69, $pop68
+; NO-SIMD128-NEXT:    i32.select $push71=, $4, $20, $pop70
+; NO-SIMD128-NEXT:    i32.store8 0($pop73), $pop71
+; NO-SIMD128-NEXT:    i32.extend8_s $push75=, $3
+; NO-SIMD128-NEXT:    i32.extend8_s $push74=, $19
+; NO-SIMD128-NEXT:    i32.gt_s $push76=, $pop75, $pop74
+; NO-SIMD128-NEXT:    i32.select $push77=, $3, $19, $pop76
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop77
+; NO-SIMD128-NEXT:    i32.extend8_s $push79=, $2
+; NO-SIMD128-NEXT:    i32.extend8_s $push78=, $18
+; NO-SIMD128-NEXT:    i32.gt_s $push80=, $pop79, $pop78
+; NO-SIMD128-NEXT:    i32.select $push81=, $2, $18, $pop80
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop81
+; NO-SIMD128-NEXT:    i32.extend8_s $push83=, $1
+; NO-SIMD128-NEXT:    i32.extend8_s $push82=, $17
+; NO-SIMD128-NEXT:    i32.gt_s $push84=, $pop83, $pop82
+; NO-SIMD128-NEXT:    i32.select $push85=, $1, $17, $pop84
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop85
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: max_s_v16i8:
+; NO-SIMD128-FAST:         .functype max_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push1=, $1
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push0=, $17
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push2=, $pop1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.select $push3=, $1, $17, $pop2
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push5=, $2
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push4=, $18
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.select $push7=, $2, $18, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop7
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push9=, $3
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push8=, $19
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push10=, $pop9, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.select $push11=, $3, $19, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop11
+; NO-SIMD128-FAST-NEXT:    i32.const $push16=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push13=, $4
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push12=, $20
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push14=, $pop13, $pop12
+; NO-SIMD128-FAST-NEXT:    i32.select $push15=, $4, $20, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop17), $pop15
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push19=, $5
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push18=, $21
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push20=, $pop19, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.select $push21=, $5, $21, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push23=, $6
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push22=, $22
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT:    i32.select $push25=, $6, $22, $pop24
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop27), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push32=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push33=, $0, $pop32
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push29=, $7
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push28=, $23
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push30=, $pop29, $pop28
+; NO-SIMD128-FAST-NEXT:    i32.select $push31=, $7, $23, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop33), $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push38=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push39=, $0, $pop38
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push35=, $8
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push34=, $24
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push36=, $pop35, $pop34
+; NO-SIMD128-FAST-NEXT:    i32.select $push37=, $8, $24, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop39), $pop37
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push41=, $9
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push40=, $25
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push42=, $pop41, $pop40
+; NO-SIMD128-FAST-NEXT:    i32.select $push43=, $9, $25, $pop42
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop43
+; NO-SIMD128-FAST-NEXT:    i32.const $push48=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push49=, $0, $pop48
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push45=, $10
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push44=, $26
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push46=, $pop45, $pop44
+; NO-SIMD128-FAST-NEXT:    i32.select $push47=, $10, $26, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop49), $pop47
+; NO-SIMD128-FAST-NEXT:    i32.const $push54=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push55=, $0, $pop54
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push51=, $11
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push50=, $27
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push52=, $pop51, $pop50
+; NO-SIMD128-FAST-NEXT:    i32.select $push53=, $11, $27, $pop52
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop55), $pop53
+; NO-SIMD128-FAST-NEXT:    i32.const $push60=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push61=, $0, $pop60
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push57=, $12
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push56=, $28
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push58=, $pop57, $pop56
+; NO-SIMD128-FAST-NEXT:    i32.select $push59=, $12, $28, $pop58
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop61), $pop59
+; NO-SIMD128-FAST-NEXT:    i32.const $push66=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push67=, $0, $pop66
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push63=, $13
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push62=, $29
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push64=, $pop63, $pop62
+; NO-SIMD128-FAST-NEXT:    i32.select $push65=, $13, $29, $pop64
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop67), $pop65
+; NO-SIMD128-FAST-NEXT:    i32.const $push72=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push73=, $0, $pop72
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push69=, $14
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push68=, $30
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push70=, $pop69, $pop68
+; NO-SIMD128-FAST-NEXT:    i32.select $push71=, $14, $30, $pop70
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop73), $pop71
+; NO-SIMD128-FAST-NEXT:    i32.const $push78=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push79=, $0, $pop78
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push75=, $15
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push74=, $31
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push76=, $pop75, $pop74
+; NO-SIMD128-FAST-NEXT:    i32.select $push77=, $15, $31, $pop76
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop79), $pop77
+; NO-SIMD128-FAST-NEXT:    i32.const $push84=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push85=, $0, $pop84
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push81=, $16
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push80=, $32
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push82=, $pop81, $pop80
+; NO-SIMD128-FAST-NEXT:    i32.select $push83=, $16, $32, $pop82
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop85), $pop83
+; NO-SIMD128-FAST-NEXT:    return
   %c = icmp sgt <16 x i8> %x, %y
   %a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: max_u_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype max_u_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.max_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @max_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SIMD128-LABEL: max_u_v16i8:
+; SIMD128:         .functype max_u_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.max_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+; The IR under test is icmp ugt + select on <16 x i8>, i.e. an element-wise unsigned max. The SIMD128-FAST expectations below are the same single i8x16.max_u as the SIMD128 ones above.
+; SIMD128-FAST-LABEL: max_u_v16i8:
+; SIMD128-FAST:         .functype max_u_v16i8 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.max_u $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+; Expectations under the NO-SIMD128 prefix (scalar code): $0 is the store base and $N is paired with $N+16 for N = 1..16. Each pair is masked with 255, compared with i32.gt_u, selected, and written with i32.store8, byte 15 first down to byte 0.
+; NO-SIMD128-LABEL: max_u_v16i8:
+; NO-SIMD128:         .functype max_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push5=, 15
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-NEXT:    i32.and $push2=, $16, $pop0
+; NO-SIMD128-NEXT:    i32.const $push117=, 255
+; NO-SIMD128-NEXT:    i32.and $push1=, $32, $pop117
+; NO-SIMD128-NEXT:    i32.gt_u $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.select $push4=, $16, $32, $pop3
+; NO-SIMD128-NEXT:    i32.store8 0($pop6), $pop4
+; NO-SIMD128-NEXT:    i32.const $push11=, 14
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.const $push116=, 255
+; NO-SIMD128-NEXT:    i32.and $push8=, $15, $pop116
+; NO-SIMD128-NEXT:    i32.const $push115=, 255
+; NO-SIMD128-NEXT:    i32.and $push7=, $31, $pop115
+; NO-SIMD128-NEXT:    i32.gt_u $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.select $push10=, $15, $31, $pop9
+; NO-SIMD128-NEXT:    i32.store8 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push17=, 13
+; NO-SIMD128-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-NEXT:    i32.const $push114=, 255
+; NO-SIMD128-NEXT:    i32.and $push14=, $14, $pop114
+; NO-SIMD128-NEXT:    i32.const $push113=, 255
+; NO-SIMD128-NEXT:    i32.and $push13=, $30, $pop113
+; NO-SIMD128-NEXT:    i32.gt_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT:    i32.select $push16=, $14, $30, $pop15
+; NO-SIMD128-NEXT:    i32.store8 0($pop18), $pop16
+; NO-SIMD128-NEXT:    i32.const $push23=, 12
+; NO-SIMD128-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-NEXT:    i32.const $push112=, 255
+; NO-SIMD128-NEXT:    i32.and $push20=, $13, $pop112
+; NO-SIMD128-NEXT:    i32.const $push111=, 255
+; NO-SIMD128-NEXT:    i32.and $push19=, $29, $pop111
+; NO-SIMD128-NEXT:    i32.gt_u $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT:    i32.select $push22=, $13, $29, $pop21
+; NO-SIMD128-NEXT:    i32.store8 0($pop24), $pop22
+; NO-SIMD128-NEXT:    i32.const $push29=, 11
+; NO-SIMD128-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-NEXT:    i32.const $push110=, 255
+; NO-SIMD128-NEXT:    i32.and $push26=, $12, $pop110
+; NO-SIMD128-NEXT:    i32.const $push109=, 255
+; NO-SIMD128-NEXT:    i32.and $push25=, $28, $pop109
+; NO-SIMD128-NEXT:    i32.gt_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT:    i32.select $push28=, $12, $28, $pop27
+; NO-SIMD128-NEXT:    i32.store8 0($pop30), $pop28
+; NO-SIMD128-NEXT:    i32.const $push35=, 10
+; NO-SIMD128-NEXT:    i32.add $push36=, $0, $pop35
+; NO-SIMD128-NEXT:    i32.const $push108=, 255
+; NO-SIMD128-NEXT:    i32.and $push32=, $11, $pop108
+; NO-SIMD128-NEXT:    i32.const $push107=, 255
+; NO-SIMD128-NEXT:    i32.and $push31=, $27, $pop107
+; NO-SIMD128-NEXT:    i32.gt_u $push33=, $pop32, $pop31
+; NO-SIMD128-NEXT:    i32.select $push34=, $11, $27, $pop33
+; NO-SIMD128-NEXT:    i32.store8 0($pop36), $pop34
+; NO-SIMD128-NEXT:    i32.const $push41=, 9
+; NO-SIMD128-NEXT:    i32.add $push42=, $0, $pop41
+; NO-SIMD128-NEXT:    i32.const $push106=, 255
+; NO-SIMD128-NEXT:    i32.and $push38=, $10, $pop106
+; NO-SIMD128-NEXT:    i32.const $push105=, 255
+; NO-SIMD128-NEXT:    i32.and $push37=, $26, $pop105
+; NO-SIMD128-NEXT:    i32.gt_u $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT:    i32.select $push40=, $10, $26, $pop39
+; NO-SIMD128-NEXT:    i32.store8 0($pop42), $pop40
+; NO-SIMD128-NEXT:    i32.const $push104=, 255
+; NO-SIMD128-NEXT:    i32.and $push44=, $9, $pop104
+; NO-SIMD128-NEXT:    i32.const $push103=, 255
+; NO-SIMD128-NEXT:    i32.and $push43=, $25, $pop103
+; NO-SIMD128-NEXT:    i32.gt_u $push45=, $pop44, $pop43
+; NO-SIMD128-NEXT:    i32.select $push46=, $9, $25, $pop45
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop46
+; NO-SIMD128-NEXT:    i32.const $push51=, 7
+; NO-SIMD128-NEXT:    i32.add $push52=, $0, $pop51
+; NO-SIMD128-NEXT:    i32.const $push102=, 255
+; NO-SIMD128-NEXT:    i32.and $push48=, $8, $pop102
+; NO-SIMD128-NEXT:    i32.const $push101=, 255
+; NO-SIMD128-NEXT:    i32.and $push47=, $24, $pop101
+; NO-SIMD128-NEXT:    i32.gt_u $push49=, $pop48, $pop47
+; NO-SIMD128-NEXT:    i32.select $push50=, $8, $24, $pop49
+; NO-SIMD128-NEXT:    i32.store8 0($pop52), $pop50
+; NO-SIMD128-NEXT:    i32.const $push57=, 6
+; NO-SIMD128-NEXT:    i32.add $push58=, $0, $pop57
+; NO-SIMD128-NEXT:    i32.const $push100=, 255
+; NO-SIMD128-NEXT:    i32.and $push54=, $7, $pop100
+; NO-SIMD128-NEXT:    i32.const $push99=, 255
+; NO-SIMD128-NEXT:    i32.and $push53=, $23, $pop99
+; NO-SIMD128-NEXT:    i32.gt_u $push55=, $pop54, $pop53
+; NO-SIMD128-NEXT:    i32.select $push56=, $7, $23, $pop55
+; NO-SIMD128-NEXT:    i32.store8 0($pop58), $pop56
+; NO-SIMD128-NEXT:    i32.const $push63=, 5
+; NO-SIMD128-NEXT:    i32.add $push64=, $0, $pop63
+; NO-SIMD128-NEXT:    i32.const $push98=, 255
+; NO-SIMD128-NEXT:    i32.and $push60=, $6, $pop98
+; NO-SIMD128-NEXT:    i32.const $push97=, 255
+; NO-SIMD128-NEXT:    i32.and $push59=, $22, $pop97
+; NO-SIMD128-NEXT:    i32.gt_u $push61=, $pop60, $pop59
+; NO-SIMD128-NEXT:    i32.select $push62=, $6, $22, $pop61
+; NO-SIMD128-NEXT:    i32.store8 0($pop64), $pop62
+; NO-SIMD128-NEXT:    i32.const $push96=, 255
+; NO-SIMD128-NEXT:    i32.and $push66=, $5, $pop96
+; NO-SIMD128-NEXT:    i32.const $push95=, 255
+; NO-SIMD128-NEXT:    i32.and $push65=, $21, $pop95
+; NO-SIMD128-NEXT:    i32.gt_u $push67=, $pop66, $pop65
+; NO-SIMD128-NEXT:    i32.select $push68=, $5, $21, $pop67
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop68
+; NO-SIMD128-NEXT:    i32.const $push73=, 3
+; NO-SIMD128-NEXT:    i32.add $push74=, $0, $pop73
+; NO-SIMD128-NEXT:    i32.const $push94=, 255
+; NO-SIMD128-NEXT:    i32.and $push70=, $4, $pop94
+; NO-SIMD128-NEXT:    i32.const $push93=, 255
+; NO-SIMD128-NEXT:    i32.and $push69=, $20, $pop93
+; NO-SIMD128-NEXT:    i32.gt_u $push71=, $pop70, $pop69
+; NO-SIMD128-NEXT:    i32.select $push72=, $4, $20, $pop71
+; NO-SIMD128-NEXT:    i32.store8 0($pop74), $pop72
+; NO-SIMD128-NEXT:    i32.const $push92=, 255
+; NO-SIMD128-NEXT:    i32.and $push76=, $3, $pop92
+; NO-SIMD128-NEXT:    i32.const $push91=, 255
+; NO-SIMD128-NEXT:    i32.and $push75=, $19, $pop91
+; NO-SIMD128-NEXT:    i32.gt_u $push77=, $pop76, $pop75
+; NO-SIMD128-NEXT:    i32.select $push78=, $3, $19, $pop77
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop78
+; NO-SIMD128-NEXT:    i32.const $push90=, 255
+; NO-SIMD128-NEXT:    i32.and $push80=, $2, $pop90
+; NO-SIMD128-NEXT:    i32.const $push89=, 255
+; NO-SIMD128-NEXT:    i32.and $push79=, $18, $pop89
+; NO-SIMD128-NEXT:    i32.gt_u $push81=, $pop80, $pop79
+; NO-SIMD128-NEXT:    i32.select $push82=, $2, $18, $pop81
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop82
+; NO-SIMD128-NEXT:    i32.const $push88=, 255
+; NO-SIMD128-NEXT:    i32.and $push84=, $1, $pop88
+; NO-SIMD128-NEXT:    i32.const $push87=, 255
+; NO-SIMD128-NEXT:    i32.and $push83=, $17, $pop87
+; NO-SIMD128-NEXT:    i32.gt_u $push85=, $pop84, $pop83
+; NO-SIMD128-NEXT:    i32.select $push86=, $1, $17, $pop85
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop86
+; NO-SIMD128-NEXT:    return
+; Expectations under the NO-SIMD128-FAST prefix use the same mask/compare/select/store8 sequence per byte, but in ascending order, byte 0 first up to byte 15.
+; NO-SIMD128-FAST-LABEL: max_u_v16i8:
+; NO-SIMD128-FAST:         .functype max_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.const $push117=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $17, $pop117
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.select $push4=, $1, $17, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push116=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push6=, $2, $pop116
+; NO-SIMD128-FAST-NEXT:    i32.const $push115=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $18, $pop115
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push7=, $pop6, $pop5
+; NO-SIMD128-FAST-NEXT:    i32.select $push8=, $2, $18, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push114=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $3, $pop114
+; NO-SIMD128-FAST-NEXT:    i32.const $push113=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $19, $pop113
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.select $push12=, $3, $19, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push112=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push14=, $4, $pop112
+; NO-SIMD128-FAST-NEXT:    i32.const $push111=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $20, $pop111
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.select $push16=, $4, $20, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop18), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push110=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push20=, $5, $pop110
+; NO-SIMD128-FAST-NEXT:    i32.const $push109=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $21, $pop109
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.select $push22=, $5, $21, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.const $push108=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push24=, $6, $pop108
+; NO-SIMD128-FAST-NEXT:    i32.const $push107=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $22, $pop107
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push25=, $pop24, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.select $push26=, $6, $22, $pop25
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop28), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.const $push106=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push30=, $7, $pop106
+; NO-SIMD128-FAST-NEXT:    i32.const $push105=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $23, $pop105
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push31=, $pop30, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.select $push32=, $7, $23, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop34), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push40=, $0, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.const $push104=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push36=, $8, $pop104
+; NO-SIMD128-FAST-NEXT:    i32.const $push103=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push35=, $24, $pop103
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push37=, $pop36, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.select $push38=, $8, $24, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT:    i32.const $push102=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push42=, $9, $pop102
+; NO-SIMD128-FAST-NEXT:    i32.const $push101=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push41=, $25, $pop101
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push43=, $pop42, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.select $push44=, $9, $25, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop44
+; NO-SIMD128-FAST-NEXT:    i32.const $push49=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push50=, $0, $pop49
+; NO-SIMD128-FAST-NEXT:    i32.const $push100=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push46=, $10, $pop100
+; NO-SIMD128-FAST-NEXT:    i32.const $push99=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push45=, $26, $pop99
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push47=, $pop46, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.select $push48=, $10, $26, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop50), $pop48
+; NO-SIMD128-FAST-NEXT:    i32.const $push55=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push56=, $0, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.const $push98=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push52=, $11, $pop98
+; NO-SIMD128-FAST-NEXT:    i32.const $push97=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push51=, $27, $pop97
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push53=, $pop52, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.select $push54=, $11, $27, $pop53
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop56), $pop54
+; NO-SIMD128-FAST-NEXT:    i32.const $push61=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push62=, $0, $pop61
+; NO-SIMD128-FAST-NEXT:    i32.const $push96=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push58=, $12, $pop96
+; NO-SIMD128-FAST-NEXT:    i32.const $push95=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push57=, $28, $pop95
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push59=, $pop58, $pop57
+; NO-SIMD128-FAST-NEXT:    i32.select $push60=, $12, $28, $pop59
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop62), $pop60
+; NO-SIMD128-FAST-NEXT:    i32.const $push67=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push68=, $0, $pop67
+; NO-SIMD128-FAST-NEXT:    i32.const $push94=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push64=, $13, $pop94
+; NO-SIMD128-FAST-NEXT:    i32.const $push93=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push63=, $29, $pop93
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push65=, $pop64, $pop63
+; NO-SIMD128-FAST-NEXT:    i32.select $push66=, $13, $29, $pop65
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop68), $pop66
+; NO-SIMD128-FAST-NEXT:    i32.const $push73=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push74=, $0, $pop73
+; NO-SIMD128-FAST-NEXT:    i32.const $push92=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push70=, $14, $pop92
+; NO-SIMD128-FAST-NEXT:    i32.const $push91=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push69=, $30, $pop91
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push71=, $pop70, $pop69
+; NO-SIMD128-FAST-NEXT:    i32.select $push72=, $14, $30, $pop71
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop74), $pop72
+; NO-SIMD128-FAST-NEXT:    i32.const $push79=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push80=, $0, $pop79
+; NO-SIMD128-FAST-NEXT:    i32.const $push90=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push76=, $15, $pop90
+; NO-SIMD128-FAST-NEXT:    i32.const $push89=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push75=, $31, $pop89
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push77=, $pop76, $pop75
+; NO-SIMD128-FAST-NEXT:    i32.select $push78=, $15, $31, $pop77
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop80), $pop78
+; NO-SIMD128-FAST-NEXT:    i32.const $push85=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push86=, $0, $pop85
+; NO-SIMD128-FAST-NEXT:    i32.const $push88=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push82=, $16, $pop88
+; NO-SIMD128-FAST-NEXT:    i32.const $push87=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push81=, $32, $pop87
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push83=, $pop82, $pop81
+; NO-SIMD128-FAST-NEXT:    i32.select $push84=, $16, $32, $pop83
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop86), $pop84
+; NO-SIMD128-FAST-NEXT:    return
   %c = icmp ugt <16 x i8> %x, %y
   %a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: avgr_u_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype avgr_u_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.avgr_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @avgr_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SIMD128-LABEL: avgr_u_v16i8:
+; SIMD128:         .functype avgr_u_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.avgr_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: avgr_u_v16i8:
+; SIMD128-FAST:         .functype avgr_u_v16i8 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.avgr_u $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: avgr_u_v16i8:
+; NO-SIMD128:         .functype avgr_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 15
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.add $push2=, $16, $32
+; NO-SIMD128-NEXT:    i32.const $push3=, 1
+; NO-SIMD128-NEXT:    i32.add $push4=, $pop2, $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 254
+; NO-SIMD128-NEXT:    i32.and $push6=, $pop4, $pop5
+; NO-SIMD128-NEXT:    i32.const $push133=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push7=, $pop6, $pop133
+; NO-SIMD128-NEXT:    i32.store8 0($pop1), $pop7
+; NO-SIMD128-NEXT:    i32.const $push8=, 14
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.add $push10=, $15, $31
+; NO-SIMD128-NEXT:    i32.const $push132=, 1
+; NO-SIMD128-NEXT:    i32.add $push11=, $pop10, $pop132
+; NO-SIMD128-NEXT:    i32.const $push131=, 254
+; NO-SIMD128-NEXT:    i32.and $push12=, $pop11, $pop131
+; NO-SIMD128-NEXT:    i32.const $push130=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push13=, $pop12, $pop130
+; NO-SIMD128-NEXT:    i32.store8 0($pop9), $pop13
+; NO-SIMD128-NEXT:    i32.const $push14=, 13
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.add $push16=, $14, $30
+; NO-SIMD128-NEXT:    i32.const $push129=, 1
+; NO-SIMD128-NEXT:    i32.add $push17=, $pop16, $pop129
+; NO-SIMD128-NEXT:    i32.const $push128=, 254
+; NO-SIMD128-NEXT:    i32.and $push18=, $pop17, $pop128
+; NO-SIMD128-NEXT:    i32.const $push127=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push19=, $pop18, $pop127
+; NO-SIMD128-NEXT:    i32.store8 0($pop15), $pop19
+; NO-SIMD128-NEXT:    i32.const $push20=, 12
+; NO-SIMD128-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-NEXT:    i32.add $push22=, $13, $29
+; NO-SIMD128-NEXT:    i32.const $push126=, 1
+; NO-SIMD128-NEXT:    i32.add $push23=, $pop22, $pop126
+; NO-SIMD128-NEXT:    i32.const $push125=, 254
+; NO-SIMD128-NEXT:    i32.and $push24=, $pop23, $pop125
+; NO-SIMD128-NEXT:    i32.const $push124=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push25=, $pop24, $pop124
+; NO-SIMD128-NEXT:    i32.store8 0($pop21), $pop25
+; NO-SIMD128-NEXT:    i32.const $push26=, 11
+; NO-SIMD128-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT:    i32.add $push28=, $12, $28
+; NO-SIMD128-NEXT:    i32.const $push123=, 1
+; NO-SIMD128-NEXT:    i32.add $push29=, $pop28, $pop123
+; NO-SIMD128-NEXT:    i32.const $push122=, 254
+; NO-SIMD128-NEXT:    i32.and $push30=, $pop29, $pop122
+; NO-SIMD128-NEXT:    i32.const $push121=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push31=, $pop30, $pop121
+; NO-SIMD128-NEXT:    i32.store8 0($pop27), $pop31
+; NO-SIMD128-NEXT:    i32.const $push32=, 10
+; NO-SIMD128-NEXT:    i32.add $push33=, $0, $pop32
+; NO-SIMD128-NEXT:    i32.add $push34=, $11, $27
+; NO-SIMD128-NEXT:    i32.const $push120=, 1
+; NO-SIMD128-NEXT:    i32.add $push35=, $pop34, $pop120
+; NO-SIMD128-NEXT:    i32.const $push119=, 254
+; NO-SIMD128-NEXT:    i32.and $push36=, $pop35, $pop119
+; NO-SIMD128-NEXT:    i32.const $push118=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push37=, $pop36, $pop118
+; NO-SIMD128-NEXT:    i32.store8 0($pop33), $pop37
+; NO-SIMD128-NEXT:    i32.const $push38=, 9
+; NO-SIMD128-NEXT:    i32.add $push39=, $0, $pop38
+; NO-SIMD128-NEXT:    i32.add $push40=, $10, $26
+; NO-SIMD128-NEXT:    i32.const $push117=, 1
+; NO-SIMD128-NEXT:    i32.add $push41=, $pop40, $pop117
+; NO-SIMD128-NEXT:    i32.const $push116=, 254
+; NO-SIMD128-NEXT:    i32.and $push42=, $pop41, $pop116
+; NO-SIMD128-NEXT:    i32.const $push115=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push43=, $pop42, $pop115
+; NO-SIMD128-NEXT:    i32.store8 0($pop39), $pop43
+; NO-SIMD128-NEXT:    i32.add $push44=, $9, $25
+; NO-SIMD128-NEXT:    i32.const $push114=, 1
+; NO-SIMD128-NEXT:    i32.add $push45=, $pop44, $pop114
+; NO-SIMD128-NEXT:    i32.const $push113=, 254
+; NO-SIMD128-NEXT:    i32.and $push46=, $pop45, $pop113
+; NO-SIMD128-NEXT:    i32.const $push112=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push47=, $pop46, $pop112
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop47
+; NO-SIMD128-NEXT:    i32.const $push48=, 7
+; NO-SIMD128-NEXT:    i32.add $push49=, $0, $pop48
+; NO-SIMD128-NEXT:    i32.add $push50=, $8, $24
+; NO-SIMD128-NEXT:    i32.const $push111=, 1
+; NO-SIMD128-NEXT:    i32.add $push51=, $pop50, $pop111
+; NO-SIMD128-NEXT:    i32.const $push110=, 254
+; NO-SIMD128-NEXT:    i32.and $push52=, $pop51, $pop110
+; NO-SIMD128-NEXT:    i32.const $push109=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push53=, $pop52, $pop109
+; NO-SIMD128-NEXT:    i32.store8 0($pop49), $pop53
+; NO-SIMD128-NEXT:    i32.const $push54=, 6
+; NO-SIMD128-NEXT:    i32.add $push55=, $0, $pop54
+; NO-SIMD128-NEXT:    i32.add $push56=, $7, $23
+; NO-SIMD128-NEXT:    i32.const $push108=, 1
+; NO-SIMD128-NEXT:    i32.add $push57=, $pop56, $pop108
+; NO-SIMD128-NEXT:    i32.const $push107=, 254
+; NO-SIMD128-NEXT:    i32.and $push58=, $pop57, $pop107
+; NO-SIMD128-NEXT:    i32.const $push106=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push59=, $pop58, $pop106
+; NO-SIMD128-NEXT:    i32.store8 0($pop55), $pop59
+; NO-SIMD128-NEXT:    i32.const $push60=, 5
+; NO-SIMD128-NEXT:    i32.add $push61=, $0, $pop60
+; NO-SIMD128-NEXT:    i32.add $push62=, $6, $22
+; NO-SIMD128-NEXT:    i32.const $push105=, 1
+; NO-SIMD128-NEXT:    i32.add $push63=, $pop62, $pop105
+; NO-SIMD128-NEXT:    i32.const $push104=, 254
+; NO-SIMD128-NEXT:    i32.and $push64=, $pop63, $pop104
+; NO-SIMD128-NEXT:    i32.const $push103=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push65=, $pop64, $pop103
+; NO-SIMD128-NEXT:    i32.store8 0($pop61), $pop65
+; NO-SIMD128-NEXT:    i32.add $push66=, $5, $21
+; NO-SIMD128-NEXT:    i32.const $push102=, 1
+; NO-SIMD128-NEXT:    i32.add $push67=, $pop66, $pop102
+; NO-SIMD128-NEXT:    i32.const $push101=, 254
+; NO-SIMD128-NEXT:    i32.and $push68=, $pop67, $pop101
+; NO-SIMD128-NEXT:    i32.const $push100=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push69=, $pop68, $pop100
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop69
+; NO-SIMD128-NEXT:    i32.const $push70=, 3
+; NO-SIMD128-NEXT:    i32.add $push71=, $0, $pop70
+; NO-SIMD128-NEXT:    i32.add $push72=, $4, $20
+; NO-SIMD128-NEXT:    i32.const $push99=, 1
+; NO-SIMD128-NEXT:    i32.add $push73=, $pop72, $pop99
+; NO-SIMD128-NEXT:    i32.const $push98=, 254
+; NO-SIMD128-NEXT:    i32.and $push74=, $pop73, $pop98
+; NO-SIMD128-NEXT:    i32.const $push97=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push75=, $pop74, $pop97
+; NO-SIMD128-NEXT:    i32.store8 0($pop71), $pop75
+; NO-SIMD128-NEXT:    i32.add $push76=, $3, $19
+; NO-SIMD128-NEXT:    i32.const $push96=, 1
+; NO-SIMD128-NEXT:    i32.add $push77=, $pop76, $pop96
+; NO-SIMD128-NEXT:    i32.const $push95=, 254
+; NO-SIMD128-NEXT:    i32.and $push78=, $pop77, $pop95
+; NO-SIMD128-NEXT:    i32.const $push94=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push79=, $pop78, $pop94
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop79
+; NO-SIMD128-NEXT:    i32.add $push80=, $2, $18
+; NO-SIMD128-NEXT:    i32.const $push93=, 1
+; NO-SIMD128-NEXT:    i32.add $push81=, $pop80, $pop93
+; NO-SIMD128-NEXT:    i32.const $push92=, 254
+; NO-SIMD128-NEXT:    i32.and $push82=, $pop81, $pop92
+; NO-SIMD128-NEXT:    i32.const $push91=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push83=, $pop82, $pop91
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop83
+; NO-SIMD128-NEXT:    i32.add $push84=, $1, $17
+; NO-SIMD128-NEXT:    i32.const $push90=, 1
+; NO-SIMD128-NEXT:    i32.add $push85=, $pop84, $pop90
+; NO-SIMD128-NEXT:    i32.const $push89=, 254
+; NO-SIMD128-NEXT:    i32.and $push86=, $pop85, $pop89
+; NO-SIMD128-NEXT:    i32.const $push88=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push87=, $pop86, $pop88
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop87
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: avgr_u_v16i8:
+; NO-SIMD128-FAST:         .functype avgr_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.add $push0=, $1, $17
+; NO-SIMD128-FAST-NEXT:    i32.const $push1=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push2=, $pop0, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop2, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push133=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push5=, $pop4, $pop133
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.add $push6=, $2, $18
+; NO-SIMD128-FAST-NEXT:    i32.const $push132=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push7=, $pop6, $pop132
+; NO-SIMD128-FAST-NEXT:    i32.const $push131=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push8=, $pop7, $pop131
+; NO-SIMD128-FAST-NEXT:    i32.const $push130=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push9=, $pop8, $pop130
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.add $push10=, $3, $19
+; NO-SIMD128-FAST-NEXT:    i32.const $push129=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $pop10, $pop129
+; NO-SIMD128-FAST-NEXT:    i32.const $push128=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push12=, $pop11, $pop128
+; NO-SIMD128-FAST-NEXT:    i32.const $push127=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push13=, $pop12, $pop127
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push14=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.add $push16=, $4, $20
+; NO-SIMD128-FAST-NEXT:    i32.const $push126=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push17=, $pop16, $pop126
+; NO-SIMD128-FAST-NEXT:    i32.const $push125=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push18=, $pop17, $pop125
+; NO-SIMD128-FAST-NEXT:    i32.const $push124=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push19=, $pop18, $pop124
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop15), $pop19
+; NO-SIMD128-FAST-NEXT:    i32.add $push20=, $5, $21
+; NO-SIMD128-FAST-NEXT:    i32.const $push123=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push21=, $pop20, $pop123
+; NO-SIMD128-FAST-NEXT:    i32.const $push122=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push22=, $pop21, $pop122
+; NO-SIMD128-FAST-NEXT:    i32.const $push121=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push23=, $pop22, $pop121
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop23
+; NO-SIMD128-FAST-NEXT:    i32.const $push24=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-FAST-NEXT:    i32.add $push26=, $6, $22
+; NO-SIMD128-FAST-NEXT:    i32.const $push120=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $pop26, $pop120
+; NO-SIMD128-FAST-NEXT:    i32.const $push119=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push28=, $pop27, $pop119
+; NO-SIMD128-FAST-NEXT:    i32.const $push118=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push29=, $pop28, $pop118
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop25), $pop29
+; NO-SIMD128-FAST-NEXT:    i32.const $push30=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push31=, $0, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.add $push32=, $7, $23
+; NO-SIMD128-FAST-NEXT:    i32.const $push117=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push33=, $pop32, $pop117
+; NO-SIMD128-FAST-NEXT:    i32.const $push116=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push34=, $pop33, $pop116
+; NO-SIMD128-FAST-NEXT:    i32.const $push115=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push35=, $pop34, $pop115
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop31), $pop35
+; NO-SIMD128-FAST-NEXT:    i32.const $push36=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push37=, $0, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.add $push38=, $8, $24
+; NO-SIMD128-FAST-NEXT:    i32.const $push114=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push39=, $pop38, $pop114
+; NO-SIMD128-FAST-NEXT:    i32.const $push113=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push40=, $pop39, $pop113
+; NO-SIMD128-FAST-NEXT:    i32.const $push112=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push41=, $pop40, $pop112
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop37), $pop41
+; NO-SIMD128-FAST-NEXT:    i32.add $push42=, $9, $25
+; NO-SIMD128-FAST-NEXT:    i32.const $push111=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push43=, $pop42, $pop111
+; NO-SIMD128-FAST-NEXT:    i32.const $push110=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push44=, $pop43, $pop110
+; NO-SIMD128-FAST-NEXT:    i32.const $push109=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push45=, $pop44, $pop109
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop45
+; NO-SIMD128-FAST-NEXT:    i32.const $push46=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push47=, $0, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.add $push48=, $10, $26
+; NO-SIMD128-FAST-NEXT:    i32.const $push108=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push49=, $pop48, $pop108
+; NO-SIMD128-FAST-NEXT:    i32.const $push107=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push50=, $pop49, $pop107
+; NO-SIMD128-FAST-NEXT:    i32.const $push106=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push51=, $pop50, $pop106
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop47), $pop51
+; NO-SIMD128-FAST-NEXT:    i32.const $push52=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push53=, $0, $pop52
+; NO-SIMD128-FAST-NEXT:    i32.add $push54=, $11, $27
+; NO-SIMD128-FAST-NEXT:    i32.const $push105=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push55=, $pop54, $pop105
+; NO-SIMD128-FAST-NEXT:    i32.const $push104=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push56=, $pop55, $pop104
+; NO-SIMD128-FAST-NEXT:    i32.const $push103=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push57=, $pop56, $pop103
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop53), $pop57
+; NO-SIMD128-FAST-NEXT:    i32.const $push58=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push59=, $0, $pop58
+; NO-SIMD128-FAST-NEXT:    i32.add $push60=, $12, $28
+; NO-SIMD128-FAST-NEXT:    i32.const $push102=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push61=, $pop60, $pop102
+; NO-SIMD128-FAST-NEXT:    i32.const $push101=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push62=, $pop61, $pop101
+; NO-SIMD128-FAST-NEXT:    i32.const $push100=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push63=, $pop62, $pop100
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop59), $pop63
+; NO-SIMD128-FAST-NEXT:    i32.const $push64=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push65=, $0, $pop64
+; NO-SIMD128-FAST-NEXT:    i32.add $push66=, $13, $29
+; NO-SIMD128-FAST-NEXT:    i32.const $push99=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push67=, $pop66, $pop99
+; NO-SIMD128-FAST-NEXT:    i32.const $push98=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push68=, $pop67, $pop98
+; NO-SIMD128-FAST-NEXT:    i32.const $push97=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push69=, $pop68, $pop97
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop65), $pop69
+; NO-SIMD128-FAST-NEXT:    i32.const $push70=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push71=, $0, $pop70
+; NO-SIMD128-FAST-NEXT:    i32.add $push72=, $14, $30
+; NO-SIMD128-FAST-NEXT:    i32.const $push96=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push73=, $pop72, $pop96
+; NO-SIMD128-FAST-NEXT:    i32.const $push95=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push74=, $pop73, $pop95
+; NO-SIMD128-FAST-NEXT:    i32.const $push94=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push75=, $pop74, $pop94
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop71), $pop75
+; NO-SIMD128-FAST-NEXT:    i32.const $push76=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push77=, $0, $pop76
+; NO-SIMD128-FAST-NEXT:    i32.add $push78=, $15, $31
+; NO-SIMD128-FAST-NEXT:    i32.const $push93=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push79=, $pop78, $pop93
+; NO-SIMD128-FAST-NEXT:    i32.const $push92=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push80=, $pop79, $pop92
+; NO-SIMD128-FAST-NEXT:    i32.const $push91=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push81=, $pop80, $pop91
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop77), $pop81
+; NO-SIMD128-FAST-NEXT:    i32.const $push82=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push83=, $0, $pop82
+; NO-SIMD128-FAST-NEXT:    i32.add $push84=, $16, $32
+; NO-SIMD128-FAST-NEXT:    i32.const $push90=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push85=, $pop84, $pop90
+; NO-SIMD128-FAST-NEXT:    i32.const $push89=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push86=, $pop85, $pop89
+; NO-SIMD128-FAST-NEXT:    i32.const $push88=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push87=, $pop86, $pop88
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop83), $pop87
+; NO-SIMD128-FAST-NEXT:    return
   %a = add nuw <16 x i8> %x, %y
   %b = add nuw <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
                               i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@@ -106,11 +1925,336 @@ define <16 x i8> @avgr_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i8> %c
 }
 
-; CHECK-LABEL: avgr_u_v16i8_wrap:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype avgr_u_v16i8_wrap (v128, v128) -> (v128){{$}}
-; SIMD128-NOT: i8x16.avgr_u
 define <16 x i8> @avgr_u_v16i8_wrap(<16 x i8> %x, <16 x i8> %y) {
+; SIMD128-LABEL: avgr_u_v16i8_wrap:
+; SIMD128:         .functype avgr_u_v16i8_wrap (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.add $push0=, $0, $1
+; SIMD128-NEXT:    v128.const $push1=, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+; SIMD128-NEXT:    i8x16.add $push2=, $pop0, $pop1
+; SIMD128-NEXT:    i32.const $push3=, 1
+; SIMD128-NEXT:    i8x16.shr_u $push4=, $pop2, $pop3
+; SIMD128-NEXT:    return $pop4
+;
+; SIMD128-FAST-LABEL: avgr_u_v16i8_wrap:
+; SIMD128-FAST:         .functype avgr_u_v16i8_wrap (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.add $push2=, $0, $1
+; SIMD128-FAST-NEXT:    v128.const $push3=, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+; SIMD128-FAST-NEXT:    i8x16.add $push1=, $pop2, $pop3
+; SIMD128-FAST-NEXT:    i32.const $push4=, 1
+; SIMD128-FAST-NEXT:    i8x16.shr_u $push0=, $pop1, $pop4
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: avgr_u_v16i8_wrap:
+; NO-SIMD128:         .functype avgr_u_v16i8_wrap (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 15
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.add $push2=, $16, $32
+; NO-SIMD128-NEXT:    i32.const $push3=, 1
+; NO-SIMD128-NEXT:    i32.add $push4=, $pop2, $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 254
+; NO-SIMD128-NEXT:    i32.and $push6=, $pop4, $pop5
+; NO-SIMD128-NEXT:    i32.const $push133=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push7=, $pop6, $pop133
+; NO-SIMD128-NEXT:    i32.store8 0($pop1), $pop7
+; NO-SIMD128-NEXT:    i32.const $push8=, 14
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.add $push10=, $15, $31
+; NO-SIMD128-NEXT:    i32.const $push132=, 1
+; NO-SIMD128-NEXT:    i32.add $push11=, $pop10, $pop132
+; NO-SIMD128-NEXT:    i32.const $push131=, 254
+; NO-SIMD128-NEXT:    i32.and $push12=, $pop11, $pop131
+; NO-SIMD128-NEXT:    i32.const $push130=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push13=, $pop12, $pop130
+; NO-SIMD128-NEXT:    i32.store8 0($pop9), $pop13
+; NO-SIMD128-NEXT:    i32.const $push14=, 13
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.add $push16=, $14, $30
+; NO-SIMD128-NEXT:    i32.const $push129=, 1
+; NO-SIMD128-NEXT:    i32.add $push17=, $pop16, $pop129
+; NO-SIMD128-NEXT:    i32.const $push128=, 254
+; NO-SIMD128-NEXT:    i32.and $push18=, $pop17, $pop128
+; NO-SIMD128-NEXT:    i32.const $push127=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push19=, $pop18, $pop127
+; NO-SIMD128-NEXT:    i32.store8 0($pop15), $pop19
+; NO-SIMD128-NEXT:    i32.const $push20=, 12
+; NO-SIMD128-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-NEXT:    i32.add $push22=, $13, $29
+; NO-SIMD128-NEXT:    i32.const $push126=, 1
+; NO-SIMD128-NEXT:    i32.add $push23=, $pop22, $pop126
+; NO-SIMD128-NEXT:    i32.const $push125=, 254
+; NO-SIMD128-NEXT:    i32.and $push24=, $pop23, $pop125
+; NO-SIMD128-NEXT:    i32.const $push124=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push25=, $pop24, $pop124
+; NO-SIMD128-NEXT:    i32.store8 0($pop21), $pop25
+; NO-SIMD128-NEXT:    i32.const $push26=, 11
+; NO-SIMD128-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT:    i32.add $push28=, $12, $28
+; NO-SIMD128-NEXT:    i32.const $push123=, 1
+; NO-SIMD128-NEXT:    i32.add $push29=, $pop28, $pop123
+; NO-SIMD128-NEXT:    i32.const $push122=, 254
+; NO-SIMD128-NEXT:    i32.and $push30=, $pop29, $pop122
+; NO-SIMD128-NEXT:    i32.const $push121=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push31=, $pop30, $pop121
+; NO-SIMD128-NEXT:    i32.store8 0($pop27), $pop31
+; NO-SIMD128-NEXT:    i32.const $push32=, 10
+; NO-SIMD128-NEXT:    i32.add $push33=, $0, $pop32
+; NO-SIMD128-NEXT:    i32.add $push34=, $11, $27
+; NO-SIMD128-NEXT:    i32.const $push120=, 1
+; NO-SIMD128-NEXT:    i32.add $push35=, $pop34, $pop120
+; NO-SIMD128-NEXT:    i32.const $push119=, 254
+; NO-SIMD128-NEXT:    i32.and $push36=, $pop35, $pop119
+; NO-SIMD128-NEXT:    i32.const $push118=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push37=, $pop36, $pop118
+; NO-SIMD128-NEXT:    i32.store8 0($pop33), $pop37
+; NO-SIMD128-NEXT:    i32.const $push38=, 9
+; NO-SIMD128-NEXT:    i32.add $push39=, $0, $pop38
+; NO-SIMD128-NEXT:    i32.add $push40=, $10, $26
+; NO-SIMD128-NEXT:    i32.const $push117=, 1
+; NO-SIMD128-NEXT:    i32.add $push41=, $pop40, $pop117
+; NO-SIMD128-NEXT:    i32.const $push116=, 254
+; NO-SIMD128-NEXT:    i32.and $push42=, $pop41, $pop116
+; NO-SIMD128-NEXT:    i32.const $push115=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push43=, $pop42, $pop115
+; NO-SIMD128-NEXT:    i32.store8 0($pop39), $pop43
+; NO-SIMD128-NEXT:    i32.add $push44=, $9, $25
+; NO-SIMD128-NEXT:    i32.const $push114=, 1
+; NO-SIMD128-NEXT:    i32.add $push45=, $pop44, $pop114
+; NO-SIMD128-NEXT:    i32.const $push113=, 254
+; NO-SIMD128-NEXT:    i32.and $push46=, $pop45, $pop113
+; NO-SIMD128-NEXT:    i32.const $push112=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push47=, $pop46, $pop112
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop47
+; NO-SIMD128-NEXT:    i32.const $push48=, 7
+; NO-SIMD128-NEXT:    i32.add $push49=, $0, $pop48
+; NO-SIMD128-NEXT:    i32.add $push50=, $8, $24
+; NO-SIMD128-NEXT:    i32.const $push111=, 1
+; NO-SIMD128-NEXT:    i32.add $push51=, $pop50, $pop111
+; NO-SIMD128-NEXT:    i32.const $push110=, 254
+; NO-SIMD128-NEXT:    i32.and $push52=, $pop51, $pop110
+; NO-SIMD128-NEXT:    i32.const $push109=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push53=, $pop52, $pop109
+; NO-SIMD128-NEXT:    i32.store8 0($pop49), $pop53
+; NO-SIMD128-NEXT:    i32.const $push54=, 6
+; NO-SIMD128-NEXT:    i32.add $push55=, $0, $pop54
+; NO-SIMD128-NEXT:    i32.add $push56=, $7, $23
+; NO-SIMD128-NEXT:    i32.const $push108=, 1
+; NO-SIMD128-NEXT:    i32.add $push57=, $pop56, $pop108
+; NO-SIMD128-NEXT:    i32.const $push107=, 254
+; NO-SIMD128-NEXT:    i32.and $push58=, $pop57, $pop107
+; NO-SIMD128-NEXT:    i32.const $push106=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push59=, $pop58, $pop106
+; NO-SIMD128-NEXT:    i32.store8 0($pop55), $pop59
+; NO-SIMD128-NEXT:    i32.const $push60=, 5
+; NO-SIMD128-NEXT:    i32.add $push61=, $0, $pop60
+; NO-SIMD128-NEXT:    i32.add $push62=, $6, $22
+; NO-SIMD128-NEXT:    i32.const $push105=, 1
+; NO-SIMD128-NEXT:    i32.add $push63=, $pop62, $pop105
+; NO-SIMD128-NEXT:    i32.const $push104=, 254
+; NO-SIMD128-NEXT:    i32.and $push64=, $pop63, $pop104
+; NO-SIMD128-NEXT:    i32.const $push103=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push65=, $pop64, $pop103
+; NO-SIMD128-NEXT:    i32.store8 0($pop61), $pop65
+; NO-SIMD128-NEXT:    i32.add $push66=, $5, $21
+; NO-SIMD128-NEXT:    i32.const $push102=, 1
+; NO-SIMD128-NEXT:    i32.add $push67=, $pop66, $pop102
+; NO-SIMD128-NEXT:    i32.const $push101=, 254
+; NO-SIMD128-NEXT:    i32.and $push68=, $pop67, $pop101
+; NO-SIMD128-NEXT:    i32.const $push100=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push69=, $pop68, $pop100
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop69
+; NO-SIMD128-NEXT:    i32.const $push70=, 3
+; NO-SIMD128-NEXT:    i32.add $push71=, $0, $pop70
+; NO-SIMD128-NEXT:    i32.add $push72=, $4, $20
+; NO-SIMD128-NEXT:    i32.const $push99=, 1
+; NO-SIMD128-NEXT:    i32.add $push73=, $pop72, $pop99
+; NO-SIMD128-NEXT:    i32.const $push98=, 254
+; NO-SIMD128-NEXT:    i32.and $push74=, $pop73, $pop98
+; NO-SIMD128-NEXT:    i32.const $push97=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push75=, $pop74, $pop97
+; NO-SIMD128-NEXT:    i32.store8 0($pop71), $pop75
+; NO-SIMD128-NEXT:    i32.add $push76=, $3, $19
+; NO-SIMD128-NEXT:    i32.const $push96=, 1
+; NO-SIMD128-NEXT:    i32.add $push77=, $pop76, $pop96
+; NO-SIMD128-NEXT:    i32.const $push95=, 254
+; NO-SIMD128-NEXT:    i32.and $push78=, $pop77, $pop95
+; NO-SIMD128-NEXT:    i32.const $push94=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push79=, $pop78, $pop94
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop79
+; NO-SIMD128-NEXT:    i32.add $push80=, $2, $18
+; NO-SIMD128-NEXT:    i32.const $push93=, 1
+; NO-SIMD128-NEXT:    i32.add $push81=, $pop80, $pop93
+; NO-SIMD128-NEXT:    i32.const $push92=, 254
+; NO-SIMD128-NEXT:    i32.and $push82=, $pop81, $pop92
+; NO-SIMD128-NEXT:    i32.const $push91=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push83=, $pop82, $pop91
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop83
+; NO-SIMD128-NEXT:    i32.add $push84=, $1, $17
+; NO-SIMD128-NEXT:    i32.const $push90=, 1
+; NO-SIMD128-NEXT:    i32.add $push85=, $pop84, $pop90
+; NO-SIMD128-NEXT:    i32.const $push89=, 254
+; NO-SIMD128-NEXT:    i32.and $push86=, $pop85, $pop89
+; NO-SIMD128-NEXT:    i32.const $push88=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push87=, $pop86, $pop88
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop87
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: avgr_u_v16i8_wrap:
+; NO-SIMD128-FAST:         .functype avgr_u_v16i8_wrap (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.add $push0=, $1, $17
+; NO-SIMD128-FAST-NEXT:    i32.const $push1=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push2=, $pop0, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop2, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push133=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push5=, $pop4, $pop133
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.add $push6=, $2, $18
+; NO-SIMD128-FAST-NEXT:    i32.const $push132=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push7=, $pop6, $pop132
+; NO-SIMD128-FAST-NEXT:    i32.const $push131=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push8=, $pop7, $pop131
+; NO-SIMD128-FAST-NEXT:    i32.const $push130=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push9=, $pop8, $pop130
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.add $push10=, $3, $19
+; NO-SIMD128-FAST-NEXT:    i32.const $push129=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $pop10, $pop129
+; NO-SIMD128-FAST-NEXT:    i32.const $push128=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push12=, $pop11, $pop128
+; NO-SIMD128-FAST-NEXT:    i32.const $push127=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push13=, $pop12, $pop127
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push14=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.add $push16=, $4, $20
+; NO-SIMD128-FAST-NEXT:    i32.const $push126=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push17=, $pop16, $pop126
+; NO-SIMD128-FAST-NEXT:    i32.const $push125=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push18=, $pop17, $pop125
+; NO-SIMD128-FAST-NEXT:    i32.const $push124=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push19=, $pop18, $pop124
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop15), $pop19
+; NO-SIMD128-FAST-NEXT:    i32.add $push20=, $5, $21
+; NO-SIMD128-FAST-NEXT:    i32.const $push123=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push21=, $pop20, $pop123
+; NO-SIMD128-FAST-NEXT:    i32.const $push122=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push22=, $pop21, $pop122
+; NO-SIMD128-FAST-NEXT:    i32.const $push121=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push23=, $pop22, $pop121
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop23
+; NO-SIMD128-FAST-NEXT:    i32.const $push24=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-FAST-NEXT:    i32.add $push26=, $6, $22
+; NO-SIMD128-FAST-NEXT:    i32.const $push120=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $pop26, $pop120
+; NO-SIMD128-FAST-NEXT:    i32.const $push119=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push28=, $pop27, $pop119
+; NO-SIMD128-FAST-NEXT:    i32.const $push118=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push29=, $pop28, $pop118
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop25), $pop29
+; NO-SIMD128-FAST-NEXT:    i32.const $push30=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push31=, $0, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.add $push32=, $7, $23
+; NO-SIMD128-FAST-NEXT:    i32.const $push117=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push33=, $pop32, $pop117
+; NO-SIMD128-FAST-NEXT:    i32.const $push116=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push34=, $pop33, $pop116
+; NO-SIMD128-FAST-NEXT:    i32.const $push115=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push35=, $pop34, $pop115
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop31), $pop35
+; NO-SIMD128-FAST-NEXT:    i32.const $push36=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push37=, $0, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.add $push38=, $8, $24
+; NO-SIMD128-FAST-NEXT:    i32.const $push114=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push39=, $pop38, $pop114
+; NO-SIMD128-FAST-NEXT:    i32.const $push113=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push40=, $pop39, $pop113
+; NO-SIMD128-FAST-NEXT:    i32.const $push112=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push41=, $pop40, $pop112
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop37), $pop41
+; NO-SIMD128-FAST-NEXT:    i32.add $push42=, $9, $25
+; NO-SIMD128-FAST-NEXT:    i32.const $push111=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push43=, $pop42, $pop111
+; NO-SIMD128-FAST-NEXT:    i32.const $push110=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push44=, $pop43, $pop110
+; NO-SIMD128-FAST-NEXT:    i32.const $push109=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push45=, $pop44, $pop109
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop45
+; NO-SIMD128-FAST-NEXT:    i32.const $push46=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push47=, $0, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.add $push48=, $10, $26
+; NO-SIMD128-FAST-NEXT:    i32.const $push108=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push49=, $pop48, $pop108
+; NO-SIMD128-FAST-NEXT:    i32.const $push107=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push50=, $pop49, $pop107
+; NO-SIMD128-FAST-NEXT:    i32.const $push106=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push51=, $pop50, $pop106
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop47), $pop51
+; NO-SIMD128-FAST-NEXT:    i32.const $push52=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push53=, $0, $pop52
+; NO-SIMD128-FAST-NEXT:    i32.add $push54=, $11, $27
+; NO-SIMD128-FAST-NEXT:    i32.const $push105=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push55=, $pop54, $pop105
+; NO-SIMD128-FAST-NEXT:    i32.const $push104=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push56=, $pop55, $pop104
+; NO-SIMD128-FAST-NEXT:    i32.const $push103=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push57=, $pop56, $pop103
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop53), $pop57
+; NO-SIMD128-FAST-NEXT:    i32.const $push58=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push59=, $0, $pop58
+; NO-SIMD128-FAST-NEXT:    i32.add $push60=, $12, $28
+; NO-SIMD128-FAST-NEXT:    i32.const $push102=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push61=, $pop60, $pop102
+; NO-SIMD128-FAST-NEXT:    i32.const $push101=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push62=, $pop61, $pop101
+; NO-SIMD128-FAST-NEXT:    i32.const $push100=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push63=, $pop62, $pop100
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop59), $pop63
+; NO-SIMD128-FAST-NEXT:    i32.const $push64=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push65=, $0, $pop64
+; NO-SIMD128-FAST-NEXT:    i32.add $push66=, $13, $29
+; NO-SIMD128-FAST-NEXT:    i32.const $push99=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push67=, $pop66, $pop99
+; NO-SIMD128-FAST-NEXT:    i32.const $push98=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push68=, $pop67, $pop98
+; NO-SIMD128-FAST-NEXT:    i32.const $push97=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push69=, $pop68, $pop97
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop65), $pop69
+; NO-SIMD128-FAST-NEXT:    i32.const $push70=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push71=, $0, $pop70
+; NO-SIMD128-FAST-NEXT:    i32.add $push72=, $14, $30
+; NO-SIMD128-FAST-NEXT:    i32.const $push96=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push73=, $pop72, $pop96
+; NO-SIMD128-FAST-NEXT:    i32.const $push95=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push74=, $pop73, $pop95
+; NO-SIMD128-FAST-NEXT:    i32.const $push94=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push75=, $pop74, $pop94
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop71), $pop75
+; NO-SIMD128-FAST-NEXT:    i32.const $push76=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push77=, $0, $pop76
+; NO-SIMD128-FAST-NEXT:    i32.add $push78=, $15, $31
+; NO-SIMD128-FAST-NEXT:    i32.const $push93=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push79=, $pop78, $pop93
+; NO-SIMD128-FAST-NEXT:    i32.const $push92=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push80=, $pop79, $pop92
+; NO-SIMD128-FAST-NEXT:    i32.const $push91=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push81=, $pop80, $pop91
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop77), $pop81
+; NO-SIMD128-FAST-NEXT:    i32.const $push82=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push83=, $0, $pop82
+; NO-SIMD128-FAST-NEXT:    i32.add $push84=, $16, $32
+; NO-SIMD128-FAST-NEXT:    i32.const $push90=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push85=, $pop84, $pop90
+; NO-SIMD128-FAST-NEXT:    i32.const $push89=, 254
+; NO-SIMD128-FAST-NEXT:    i32.and $push86=, $pop85, $pop89
+; NO-SIMD128-FAST-NEXT:    i32.const $push88=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push87=, $pop86, $pop88
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop83), $pop87
+; NO-SIMD128-FAST-NEXT:    return
   %a = add <16 x i8> %x, %y
   %b = add <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
                           i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@@ -119,36 +2263,606 @@ define <16 x i8> @avgr_u_v16i8_wrap(<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i8> %c
 }
 
-; CHECK-LABEL: abs_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype abs_v16i8 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.abs $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @abs_v16i8(<16 x i8> %x) {
+; SIMD128-LABEL: abs_v16i8:
+; SIMD128:         .functype abs_v16i8 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.abs $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: abs_v16i8:
+; SIMD128-FAST:         .functype abs_v16i8 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.abs $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: abs_v16i8:
+; NO-SIMD128:         .functype abs_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push4=, 15
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.extend8_s $push0=, $16
+; NO-SIMD128-NEXT:    i32.const $push1=, 7
+; NO-SIMD128-NEXT:    i32.shr_s $push117=, $pop0, $pop1
+; NO-SIMD128-NEXT:    local.tee $push116=, $17=, $pop117
+; NO-SIMD128-NEXT:    i32.xor $push2=, $16, $pop116
+; NO-SIMD128-NEXT:    i32.sub $push3=, $pop2, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop5), $pop3
+; NO-SIMD128-NEXT:    i32.const $push9=, 14
+; NO-SIMD128-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT:    i32.extend8_s $push6=, $15
+; NO-SIMD128-NEXT:    i32.const $push115=, 7
+; NO-SIMD128-NEXT:    i32.shr_s $push114=, $pop6, $pop115
+; NO-SIMD128-NEXT:    local.tee $push113=, $16=, $pop114
+; NO-SIMD128-NEXT:    i32.xor $push7=, $15, $pop113
+; NO-SIMD128-NEXT:    i32.sub $push8=, $pop7, $16
+; NO-SIMD128-NEXT:    i32.store8 0($pop10), $pop8
+; NO-SIMD128-NEXT:    i32.const $push14=, 13
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.extend8_s $push11=, $14
+; NO-SIMD128-NEXT:    i32.const $push112=, 7
+; NO-SIMD128-NEXT:    i32.shr_s $push111=, $pop11, $pop112
+; NO-SIMD128-NEXT:    local.tee $push110=, $16=, $pop111
+; NO-SIMD128-NEXT:    i32.xor $push12=, $14, $pop110
+; NO-SIMD128-NEXT:    i32.sub $push13=, $pop12, $16
+; NO-SIMD128-NEXT:    i32.store8 0($pop15), $pop13
+; NO-SIMD128-NEXT:    i32.const $push19=, 12
+; NO-SIMD128-NEXT:    i32.add $push20=, $0, $pop19
+; NO-SIMD128-NEXT:    i32.extend8_s $push16=, $13
+; NO-SIMD128-NEXT:    i32.const $push109=, 7
+; NO-SIMD128-NEXT:    i32.shr_s $push108=, $pop16, $pop109
+; NO-SIMD128-NEXT:    local.tee $push107=, $16=, $pop108
+; NO-SIMD128-NEXT:    i32.xor $push17=, $13, $pop107
+; NO-SIMD128-NEXT:    i32.sub $push18=, $pop17, $16
+; NO-SIMD128-NEXT:    i32.store8 0($pop20), $pop18
+; NO-SIMD128-NEXT:    i32.const $push24=, 11
+; NO-SIMD128-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-NEXT:    i32.extend8_s $push21=, $12
+; NO-SIMD128-NEXT:    i32.const $push106=, 7
+; NO-SIMD128-NEXT:    i32.shr_s $push105=, $pop21, $pop106
+; NO-SIMD128-NEXT:    local.tee $push104=, $16=, $pop105
+; NO-SIMD128-NEXT:    i32.xor $push22=, $12, $pop104
+; NO-SIMD128-NEXT:    i32.sub $push23=, $pop22, $16
+; NO-SIMD128-NEXT:    i32.store8 0($pop25), $pop23
+; NO-SIMD128-NEXT:    i32.const $push29=, 10
+; NO-SIMD128-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-NEXT:    i32.extend8_s $push26=, $11
+; NO-SIMD128-NEXT:    i32.const $push103=, 7
+; NO-SIMD128-NEXT:    i32.shr_s $push102=, $pop26, $pop103
+; NO-SIMD128-NEXT:    local.tee $push101=, $16=, $pop102
+; NO-SIMD128-NEXT:    i32.xor $push27=, $11, $pop101
+; NO-SIMD128-NEXT:    i32.sub $push28=, $pop27, $16
+; NO-SIMD128-NEXT:    i32.store8 0($pop30), $pop28
+; NO-SIMD128-NEXT:    i32.const $push34=, 9
+; NO-SIMD128-NEXT:    i32.add $push35=, $0, $pop34
+; NO-SIMD128-NEXT:    i32.extend8_s $push31=, $10
+; NO-SIMD128-NEXT:    i32.const $push100=, 7
+; NO-SIMD128-NEXT:    i32.shr_s $push99=, $pop31, $pop100
+; NO-SIMD128-NEXT:    local.tee $push98=, $16=, $pop99
+; NO-SIMD128-NEXT:    i32.xor $push32=, $10, $pop98
+; NO-SIMD128-NEXT:    i32.sub $push33=, $pop32, $16
+; NO-SIMD128-NEXT:    i32.store8 0($pop35), $pop33
+; NO-SIMD128-NEXT:    i32.extend8_s $push36=, $9
+; NO-SIMD128-NEXT:    i32.const $push97=, 7
+; NO-SIMD128-NEXT:    i32.shr_s $push96=, $pop36, $pop97
+; NO-SIMD128-NEXT:    local.tee $push95=, $16=, $pop96
+; NO-SIMD128-NEXT:    i32.xor $push37=, $9, $pop95
+; NO-SIMD128-NEXT:    i32.sub $push38=, $pop37, $16
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop38
+; NO-SIMD128-NEXT:    i32.const $push94=, 7
+; NO-SIMD128-NEXT:    i32.add $push42=, $0, $pop94
+; NO-SIMD128-NEXT:    i32.extend8_s $push39=, $8
+; NO-SIMD128-NEXT:    i32.const $push93=, 7
+; NO-SIMD128-NEXT:    i32.shr_s $push92=, $pop39, $pop93
+; NO-SIMD128-NEXT:    local.tee $push91=, $16=, $pop92
+; NO-SIMD128-NEXT:    i32.xor $push40=, $8, $pop91
+; NO-SIMD128-NEXT:    i32.sub $push41=, $pop40, $16
+; NO-SIMD128-NEXT:    i32.store8 0($pop42), $pop41
+; NO-SIMD128-NEXT:    i32.const $push46=, 6
+; NO-SIMD128-NEXT:    i32.add $push47=, $0, $pop46
+; NO-SIMD128-NEXT:    i32.extend8_s $push43=, $7
+; NO-SIMD128-NEXT:    i32.const $push90=, 7
+; NO-SIMD128-NEXT:    i32.shr_s $push89=, $pop43, $pop90
+; NO-SIMD128-NEXT:    local.tee $push88=, $16=, $pop89
+; NO-SIMD128-NEXT:    i32.xor $push44=, $7, $pop88
+; NO-SIMD128-NEXT:    i32.sub $push45=, $pop44, $16
+; NO-SIMD128-NEXT:    i32.store8 0($pop47), $pop45
+; NO-SIMD128-NEXT:    i32.const $push51=, 5
+; NO-SIMD128-NEXT:    i32.add $push52=, $0, $pop51
+; NO-SIMD128-NEXT:    i32.extend8_s $push48=, $6
+; NO-SIMD128-NEXT:    i32.const $push87=, 7
+; NO-SIMD128-NEXT:    i32.shr_s $push86=, $pop48, $pop87
+; NO-SIMD128-NEXT:    local.tee $push85=, $16=, $pop86
+; NO-SIMD128-NEXT:    i32.xor $push49=, $6, $pop85
+; NO-SIMD128-NEXT:    i32.sub $push50=, $pop49, $16
+; NO-SIMD128-NEXT:    i32.store8 0($pop52), $pop50
+; NO-SIMD128-NEXT:    i32.extend8_s $push53=, $5
+; NO-SIMD128-NEXT:    i32.const $push84=, 7
+; NO-SIMD128-NEXT:    i32.shr_s $push83=, $pop53, $pop84
+; NO-SIMD128-NEXT:    local.tee $push82=, $16=, $pop83
+; NO-SIMD128-NEXT:    i32.xor $push54=, $5, $pop82
+; NO-SIMD128-NEXT:    i32.sub $push55=, $pop54, $16
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop55
+; NO-SIMD128-NEXT:    i32.const $push59=, 3
+; NO-SIMD128-NEXT:    i32.add $push60=, $0, $pop59
+; NO-SIMD128-NEXT:    i32.extend8_s $push56=, $4
+; NO-SIMD128-NEXT:    i32.const $push81=, 7
+; NO-SIMD128-NEXT:    i32.shr_s $push80=, $pop56, $pop81
+; NO-SIMD128-NEXT:    local.tee $push79=, $16=, $pop80
+; NO-SIMD128-NEXT:    i32.xor $push57=, $4, $pop79
+; NO-SIMD128-NEXT:    i32.sub $push58=, $pop57, $16
+; NO-SIMD128-NEXT:    i32.store8 0($pop60), $pop58
+; NO-SIMD128-NEXT:    i32.extend8_s $push61=, $3
+; NO-SIMD128-NEXT:    i32.const $push78=, 7
+; NO-SIMD128-NEXT:    i32.shr_s $push77=, $pop61, $pop78
+; NO-SIMD128-NEXT:    local.tee $push76=, $16=, $pop77
+; NO-SIMD128-NEXT:    i32.xor $push62=, $3, $pop76
+; NO-SIMD128-NEXT:    i32.sub $push63=, $pop62, $16
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop63
+; NO-SIMD128-NEXT:    i32.extend8_s $push64=, $2
+; NO-SIMD128-NEXT:    i32.const $push75=, 7
+; NO-SIMD128-NEXT:    i32.shr_s $push74=, $pop64, $pop75
+; NO-SIMD128-NEXT:    local.tee $push73=, $16=, $pop74
+; NO-SIMD128-NEXT:    i32.xor $push65=, $2, $pop73
+; NO-SIMD128-NEXT:    i32.sub $push66=, $pop65, $16
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop66
+; NO-SIMD128-NEXT:    i32.extend8_s $push67=, $1
+; NO-SIMD128-NEXT:    i32.const $push72=, 7
+; NO-SIMD128-NEXT:    i32.shr_s $push71=, $pop67, $pop72
+; NO-SIMD128-NEXT:    local.tee $push70=, $16=, $pop71
+; NO-SIMD128-NEXT:    i32.xor $push68=, $1, $pop70
+; NO-SIMD128-NEXT:    i32.sub $push69=, $pop68, $16
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop69
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: abs_v16i8:
+; NO-SIMD128-FAST:         .functype abs_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push0=, $1
+; NO-SIMD128-FAST-NEXT:    i32.const $push1=, 7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push117=, $pop0, $pop1
+; NO-SIMD128-FAST-NEXT:    local.tee $push116=, $17=, $pop117
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop116
+; NO-SIMD128-FAST-NEXT:    i32.sub $push3=, $pop2, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push4=, $2
+; NO-SIMD128-FAST-NEXT:    i32.const $push115=, 7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push114=, $pop4, $pop115
+; NO-SIMD128-FAST-NEXT:    local.tee $push113=, $1=, $pop114
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $2, $pop113
+; NO-SIMD128-FAST-NEXT:    i32.sub $push6=, $pop5, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push7=, $3
+; NO-SIMD128-FAST-NEXT:    i32.const $push112=, 7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push111=, $pop7, $pop112
+; NO-SIMD128-FAST-NEXT:    local.tee $push110=, $2=, $pop111
+; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $3, $pop110
+; NO-SIMD128-FAST-NEXT:    i32.sub $push9=, $pop8, $2
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push10=, $4
+; NO-SIMD128-FAST-NEXT:    i32.const $push109=, 7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push108=, $pop10, $pop109
+; NO-SIMD128-FAST-NEXT:    local.tee $push107=, $3=, $pop108
+; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $4, $pop107
+; NO-SIMD128-FAST-NEXT:    i32.sub $push12=, $pop11, $3
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop14), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push15=, $5
+; NO-SIMD128-FAST-NEXT:    i32.const $push106=, 7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push105=, $pop15, $pop106
+; NO-SIMD128-FAST-NEXT:    local.tee $push104=, $4=, $pop105
+; NO-SIMD128-FAST-NEXT:    i32.xor $push16=, $5, $pop104
+; NO-SIMD128-FAST-NEXT:    i32.sub $push17=, $pop16, $4
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push18=, $6
+; NO-SIMD128-FAST-NEXT:    i32.const $push103=, 7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push102=, $pop18, $pop103
+; NO-SIMD128-FAST-NEXT:    local.tee $push101=, $5=, $pop102
+; NO-SIMD128-FAST-NEXT:    i32.xor $push19=, $6, $pop101
+; NO-SIMD128-FAST-NEXT:    i32.sub $push20=, $pop19, $5
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop22), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push23=, $7
+; NO-SIMD128-FAST-NEXT:    i32.const $push100=, 7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push99=, $pop23, $pop100
+; NO-SIMD128-FAST-NEXT:    local.tee $push98=, $6=, $pop99
+; NO-SIMD128-FAST-NEXT:    i32.xor $push24=, $7, $pop98
+; NO-SIMD128-FAST-NEXT:    i32.sub $push25=, $pop24, $6
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop27), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push97=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push31=, $0, $pop97
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push28=, $8
+; NO-SIMD128-FAST-NEXT:    i32.const $push96=, 7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push95=, $pop28, $pop96
+; NO-SIMD128-FAST-NEXT:    local.tee $push94=, $7=, $pop95
+; NO-SIMD128-FAST-NEXT:    i32.xor $push29=, $8, $pop94
+; NO-SIMD128-FAST-NEXT:    i32.sub $push30=, $pop29, $7
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop31), $pop30
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push32=, $9
+; NO-SIMD128-FAST-NEXT:    i32.const $push93=, 7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push92=, $pop32, $pop93
+; NO-SIMD128-FAST-NEXT:    local.tee $push91=, $8=, $pop92
+; NO-SIMD128-FAST-NEXT:    i32.xor $push33=, $9, $pop91
+; NO-SIMD128-FAST-NEXT:    i32.sub $push34=, $pop33, $8
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop34
+; NO-SIMD128-FAST-NEXT:    i32.const $push38=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push39=, $0, $pop38
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push35=, $10
+; NO-SIMD128-FAST-NEXT:    i32.const $push90=, 7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push89=, $pop35, $pop90
+; NO-SIMD128-FAST-NEXT:    local.tee $push88=, $9=, $pop89
+; NO-SIMD128-FAST-NEXT:    i32.xor $push36=, $10, $pop88
+; NO-SIMD128-FAST-NEXT:    i32.sub $push37=, $pop36, $9
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop39), $pop37
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push44=, $0, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push40=, $11
+; NO-SIMD128-FAST-NEXT:    i32.const $push87=, 7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push86=, $pop40, $pop87
+; NO-SIMD128-FAST-NEXT:    local.tee $push85=, $10=, $pop86
+; NO-SIMD128-FAST-NEXT:    i32.xor $push41=, $11, $pop85
+; NO-SIMD128-FAST-NEXT:    i32.sub $push42=, $pop41, $10
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop44), $pop42
+; NO-SIMD128-FAST-NEXT:    i32.const $push48=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push49=, $0, $pop48
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push45=, $12
+; NO-SIMD128-FAST-NEXT:    i32.const $push84=, 7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push83=, $pop45, $pop84
+; NO-SIMD128-FAST-NEXT:    local.tee $push82=, $11=, $pop83
+; NO-SIMD128-FAST-NEXT:    i32.xor $push46=, $12, $pop82
+; NO-SIMD128-FAST-NEXT:    i32.sub $push47=, $pop46, $11
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop49), $pop47
+; NO-SIMD128-FAST-NEXT:    i32.const $push53=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push54=, $0, $pop53
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push50=, $13
+; NO-SIMD128-FAST-NEXT:    i32.const $push81=, 7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push80=, $pop50, $pop81
+; NO-SIMD128-FAST-NEXT:    local.tee $push79=, $12=, $pop80
+; NO-SIMD128-FAST-NEXT:    i32.xor $push51=, $13, $pop79
+; NO-SIMD128-FAST-NEXT:    i32.sub $push52=, $pop51, $12
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop54), $pop52
+; NO-SIMD128-FAST-NEXT:    i32.const $push58=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push59=, $0, $pop58
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push55=, $14
+; NO-SIMD128-FAST-NEXT:    i32.const $push78=, 7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push77=, $pop55, $pop78
+; NO-SIMD128-FAST-NEXT:    local.tee $push76=, $13=, $pop77
+; NO-SIMD128-FAST-NEXT:    i32.xor $push56=, $14, $pop76
+; NO-SIMD128-FAST-NEXT:    i32.sub $push57=, $pop56, $13
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop59), $pop57
+; NO-SIMD128-FAST-NEXT:    i32.const $push63=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push64=, $0, $pop63
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push60=, $15
+; NO-SIMD128-FAST-NEXT:    i32.const $push75=, 7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push74=, $pop60, $pop75
+; NO-SIMD128-FAST-NEXT:    local.tee $push73=, $14=, $pop74
+; NO-SIMD128-FAST-NEXT:    i32.xor $push61=, $15, $pop73
+; NO-SIMD128-FAST-NEXT:    i32.sub $push62=, $pop61, $14
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop64), $pop62
+; NO-SIMD128-FAST-NEXT:    i32.const $push68=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push69=, $0, $pop68
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push65=, $16
+; NO-SIMD128-FAST-NEXT:    i32.const $push72=, 7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push71=, $pop65, $pop72
+; NO-SIMD128-FAST-NEXT:    local.tee $push70=, $0=, $pop71
+; NO-SIMD128-FAST-NEXT:    i32.xor $push66=, $16, $pop70
+; NO-SIMD128-FAST-NEXT:    i32.sub $push67=, $pop66, $0
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop69), $pop67
+; NO-SIMD128-FAST-NEXT:    return
   %a = sub <16 x i8> zeroinitializer, %x
   %b = icmp slt <16 x i8> %x, zeroinitializer
   %c = select <16 x i1> %b, <16 x i8> %a, <16 x i8> %x
   ret <16 x i8> %c
 }
 
-; CHECK-LABEL: neg_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype neg_v16i8 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.neg $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @neg_v16i8(<16 x i8> %x) {
+; SIMD128-LABEL: neg_v16i8:
+; SIMD128:         .functype neg_v16i8 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.neg $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: neg_v16i8:
+; SIMD128-FAST:         .functype neg_v16i8 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.neg $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: neg_v16i8:
+; NO-SIMD128:         .functype neg_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 0
+; NO-SIMD128-NEXT:    i32.sub $push1=, $pop0, $9
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop1
+; NO-SIMD128-NEXT:    i32.const $push53=, 0
+; NO-SIMD128-NEXT:    i32.sub $push2=, $pop53, $5
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push52=, 0
+; NO-SIMD128-NEXT:    i32.sub $push3=, $pop52, $3
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push51=, 0
+; NO-SIMD128-NEXT:    i32.sub $push4=, $pop51, $2
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push50=, 0
+; NO-SIMD128-NEXT:    i32.sub $push5=, $pop50, $1
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop5
+; NO-SIMD128-NEXT:    i32.const $push7=, 15
+; NO-SIMD128-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-NEXT:    i32.const $push49=, 0
+; NO-SIMD128-NEXT:    i32.sub $push6=, $pop49, $16
+; NO-SIMD128-NEXT:    i32.store8 0($pop8), $pop6
+; NO-SIMD128-NEXT:    i32.const $push10=, 14
+; NO-SIMD128-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-NEXT:    i32.const $push48=, 0
+; NO-SIMD128-NEXT:    i32.sub $push9=, $pop48, $15
+; NO-SIMD128-NEXT:    i32.store8 0($pop11), $pop9
+; NO-SIMD128-NEXT:    i32.const $push13=, 13
+; NO-SIMD128-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT:    i32.const $push47=, 0
+; NO-SIMD128-NEXT:    i32.sub $push12=, $pop47, $14
+; NO-SIMD128-NEXT:    i32.store8 0($pop14), $pop12
+; NO-SIMD128-NEXT:    i32.const $push16=, 12
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.const $push46=, 0
+; NO-SIMD128-NEXT:    i32.sub $push15=, $pop46, $13
+; NO-SIMD128-NEXT:    i32.store8 0($pop17), $pop15
+; NO-SIMD128-NEXT:    i32.const $push19=, 11
+; NO-SIMD128-NEXT:    i32.add $push20=, $0, $pop19
+; NO-SIMD128-NEXT:    i32.const $push45=, 0
+; NO-SIMD128-NEXT:    i32.sub $push18=, $pop45, $12
+; NO-SIMD128-NEXT:    i32.store8 0($pop20), $pop18
+; NO-SIMD128-NEXT:    i32.const $push22=, 10
+; NO-SIMD128-NEXT:    i32.add $push23=, $0, $pop22
+; NO-SIMD128-NEXT:    i32.const $push44=, 0
+; NO-SIMD128-NEXT:    i32.sub $push21=, $pop44, $11
+; NO-SIMD128-NEXT:    i32.store8 0($pop23), $pop21
+; NO-SIMD128-NEXT:    i32.const $push25=, 9
+; NO-SIMD128-NEXT:    i32.add $push26=, $0, $pop25
+; NO-SIMD128-NEXT:    i32.const $push43=, 0
+; NO-SIMD128-NEXT:    i32.sub $push24=, $pop43, $10
+; NO-SIMD128-NEXT:    i32.store8 0($pop26), $pop24
+; NO-SIMD128-NEXT:    i32.const $push28=, 7
+; NO-SIMD128-NEXT:    i32.add $push29=, $0, $pop28
+; NO-SIMD128-NEXT:    i32.const $push42=, 0
+; NO-SIMD128-NEXT:    i32.sub $push27=, $pop42, $8
+; NO-SIMD128-NEXT:    i32.store8 0($pop29), $pop27
+; NO-SIMD128-NEXT:    i32.const $push31=, 6
+; NO-SIMD128-NEXT:    i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT:    i32.const $push41=, 0
+; NO-SIMD128-NEXT:    i32.sub $push30=, $pop41, $7
+; NO-SIMD128-NEXT:    i32.store8 0($pop32), $pop30
+; NO-SIMD128-NEXT:    i32.const $push34=, 5
+; NO-SIMD128-NEXT:    i32.add $push35=, $0, $pop34
+; NO-SIMD128-NEXT:    i32.const $push40=, 0
+; NO-SIMD128-NEXT:    i32.sub $push33=, $pop40, $6
+; NO-SIMD128-NEXT:    i32.store8 0($pop35), $pop33
+; NO-SIMD128-NEXT:    i32.const $push37=, 3
+; NO-SIMD128-NEXT:    i32.add $push38=, $0, $pop37
+; NO-SIMD128-NEXT:    i32.const $push39=, 0
+; NO-SIMD128-NEXT:    i32.sub $push36=, $pop39, $4
+; NO-SIMD128-NEXT:    i32.store8 0($pop38), $pop36
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: neg_v16i8:
+; NO-SIMD128-FAST:         .functype neg_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.const $push53=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push2=, $pop53, $2
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push52=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push3=, $pop52, $3
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push51=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push6=, $pop51, $4
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push50=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push7=, $pop50, $5
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop7
+; NO-SIMD128-FAST-NEXT:    i32.const $push8=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push49=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push10=, $pop49, $6
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop9), $pop10
+; NO-SIMD128-FAST-NEXT:    i32.const $push11=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.const $push48=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push13=, $pop48, $7
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop12), $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push14=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push16=, $pop47, $8
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push46=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push17=, $pop46, $9
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push18=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.const $push45=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push20=, $pop45, $10
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop19), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push44=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push23=, $pop44, $11
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop22), $pop23
+; NO-SIMD128-FAST-NEXT:    i32.const $push24=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push26=, $pop43, $12
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop25), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.const $push42=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push29=, $pop42, $13
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop28), $pop29
+; NO-SIMD128-FAST-NEXT:    i32.const $push30=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push31=, $0, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.const $push41=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push32=, $pop41, $14
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop31), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.const $push40=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push35=, $pop40, $15
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop34), $pop35
+; NO-SIMD128-FAST-NEXT:    i32.const $push36=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push37=, $0, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push38=, $pop39, $16
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop37), $pop38
+; NO-SIMD128-FAST-NEXT:    return
   %a = sub <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
                       i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>,
                      %x
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: shl_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shl_v16i8 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shl $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @shl_v16i8(<16 x i8> %v, i8 %x) {
+; SIMD128-LABEL: shl_v16i8:
+; SIMD128:         .functype shl_v16i8 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shl $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: shl_v16i8:
+; SIMD128-FAST:         .functype shl_v16i8 (v128, i32) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.shl $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shl_v16i8:
+; NO-SIMD128:         .functype shl_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-NEXT:    i32.and $push40=, $17, $pop0
+; NO-SIMD128-NEXT:    local.tee $push39=, $17=, $pop40
+; NO-SIMD128-NEXT:    i32.shl $push1=, $9, $pop39
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop1
+; NO-SIMD128-NEXT:    i32.shl $push2=, $5, $17
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop2
+; NO-SIMD128-NEXT:    i32.shl $push3=, $3, $17
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop3
+; NO-SIMD128-NEXT:    i32.shl $push4=, $2, $17
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop4
+; NO-SIMD128-NEXT:    i32.shl $push5=, $1, $17
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop5
+; NO-SIMD128-NEXT:    i32.const $push7=, 15
+; NO-SIMD128-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-NEXT:    i32.shl $push6=, $16, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop8), $pop6
+; NO-SIMD128-NEXT:    i32.const $push10=, 14
+; NO-SIMD128-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-NEXT:    i32.shl $push9=, $15, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop11), $pop9
+; NO-SIMD128-NEXT:    i32.const $push13=, 13
+; NO-SIMD128-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT:    i32.shl $push12=, $14, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop14), $pop12
+; NO-SIMD128-NEXT:    i32.const $push16=, 12
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.shl $push15=, $13, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop17), $pop15
+; NO-SIMD128-NEXT:    i32.const $push19=, 11
+; NO-SIMD128-NEXT:    i32.add $push20=, $0, $pop19
+; NO-SIMD128-NEXT:    i32.shl $push18=, $12, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop20), $pop18
+; NO-SIMD128-NEXT:    i32.const $push22=, 10
+; NO-SIMD128-NEXT:    i32.add $push23=, $0, $pop22
+; NO-SIMD128-NEXT:    i32.shl $push21=, $11, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop23), $pop21
+; NO-SIMD128-NEXT:    i32.const $push25=, 9
+; NO-SIMD128-NEXT:    i32.add $push26=, $0, $pop25
+; NO-SIMD128-NEXT:    i32.shl $push24=, $10, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop26), $pop24
+; NO-SIMD128-NEXT:    i32.const $push28=, 7
+; NO-SIMD128-NEXT:    i32.add $push29=, $0, $pop28
+; NO-SIMD128-NEXT:    i32.shl $push27=, $8, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop29), $pop27
+; NO-SIMD128-NEXT:    i32.const $push31=, 6
+; NO-SIMD128-NEXT:    i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT:    i32.shl $push30=, $7, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop32), $pop30
+; NO-SIMD128-NEXT:    i32.const $push34=, 5
+; NO-SIMD128-NEXT:    i32.add $push35=, $0, $pop34
+; NO-SIMD128-NEXT:    i32.shl $push33=, $6, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop35), $pop33
+; NO-SIMD128-NEXT:    i32.const $push37=, 3
+; NO-SIMD128-NEXT:    i32.add $push38=, $0, $pop37
+; NO-SIMD128-NEXT:    i32.shl $push36=, $4, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop38), $pop36
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shl_v16i8:
+; NO-SIMD128-FAST:         .functype shl_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push40=, $17, $pop0
+; NO-SIMD128-FAST-NEXT:    local.tee $push39=, $17=, $pop40
+; NO-SIMD128-FAST-NEXT:    i32.shl $push1=, $2, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.shl $push2=, $1, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.shl $push3=, $3, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.shl $push6=, $4, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.shl $push7=, $5, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop7
+; NO-SIMD128-FAST-NEXT:    i32.const $push8=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.shl $push10=, $6, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop9), $pop10
+; NO-SIMD128-FAST-NEXT:    i32.const $push11=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.shl $push13=, $7, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop12), $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push14=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.shl $push16=, $8, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.shl $push17=, $9, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push18=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.shl $push20=, $10, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop19), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.shl $push23=, $11, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop22), $pop23
+; NO-SIMD128-FAST-NEXT:    i32.const $push24=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-FAST-NEXT:    i32.shl $push26=, $12, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop25), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.shl $push29=, $13, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop28), $pop29
+; NO-SIMD128-FAST-NEXT:    i32.const $push30=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push31=, $0, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.shl $push32=, $14, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop31), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.shl $push35=, $15, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop34), $pop35
+; NO-SIMD128-FAST-NEXT:    i32.const $push36=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push37=, $0, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.shl $push38=, $16, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop37), $pop38
+; NO-SIMD128-FAST-NEXT:    return
   %t = insertelement <16 x i8> undef, i8 %x, i32 0
   %s = shufflevector <16 x i8> %t, <16 x i8> undef,
     <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
@@ -157,47 +2871,732 @@ define <16 x i8> @shl_v16i8(<16 x i8> %v, i8 %x) {
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: shl_const_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shl_const_v16i8 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5
-; SIMD128-NEXT: i8x16.shl $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @shl_const_v16i8(<16 x i8> %v) {
+; SIMD128-LABEL: shl_const_v16i8:
+; SIMD128:         .functype shl_const_v16i8 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32.const $push0=, 5
+; SIMD128-NEXT:    i8x16.shl $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: shl_const_v16i8:
+; SIMD128-FAST:         .functype shl_const_v16i8 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32.const $push1=, 5
+; SIMD128-FAST-NEXT:    i8x16.shl $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shl_const_v16i8:
+; NO-SIMD128:         .functype shl_const_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 5
+; NO-SIMD128-NEXT:    i32.shl $push1=, $9, $pop0
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop1
+; NO-SIMD128-NEXT:    i32.const $push53=, 5
+; NO-SIMD128-NEXT:    i32.shl $push2=, $5, $pop53
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push52=, 5
+; NO-SIMD128-NEXT:    i32.shl $push3=, $3, $pop52
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push51=, 5
+; NO-SIMD128-NEXT:    i32.shl $push4=, $2, $pop51
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push50=, 5
+; NO-SIMD128-NEXT:    i32.shl $push5=, $1, $pop50
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop5
+; NO-SIMD128-NEXT:    i32.const $push7=, 15
+; NO-SIMD128-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-NEXT:    i32.const $push49=, 5
+; NO-SIMD128-NEXT:    i32.shl $push6=, $16, $pop49
+; NO-SIMD128-NEXT:    i32.store8 0($pop8), $pop6
+; NO-SIMD128-NEXT:    i32.const $push10=, 14
+; NO-SIMD128-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-NEXT:    i32.const $push48=, 5
+; NO-SIMD128-NEXT:    i32.shl $push9=, $15, $pop48
+; NO-SIMD128-NEXT:    i32.store8 0($pop11), $pop9
+; NO-SIMD128-NEXT:    i32.const $push13=, 13
+; NO-SIMD128-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT:    i32.const $push47=, 5
+; NO-SIMD128-NEXT:    i32.shl $push12=, $14, $pop47
+; NO-SIMD128-NEXT:    i32.store8 0($pop14), $pop12
+; NO-SIMD128-NEXT:    i32.const $push16=, 12
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.const $push46=, 5
+; NO-SIMD128-NEXT:    i32.shl $push15=, $13, $pop46
+; NO-SIMD128-NEXT:    i32.store8 0($pop17), $pop15
+; NO-SIMD128-NEXT:    i32.const $push19=, 11
+; NO-SIMD128-NEXT:    i32.add $push20=, $0, $pop19
+; NO-SIMD128-NEXT:    i32.const $push45=, 5
+; NO-SIMD128-NEXT:    i32.shl $push18=, $12, $pop45
+; NO-SIMD128-NEXT:    i32.store8 0($pop20), $pop18
+; NO-SIMD128-NEXT:    i32.const $push22=, 10
+; NO-SIMD128-NEXT:    i32.add $push23=, $0, $pop22
+; NO-SIMD128-NEXT:    i32.const $push44=, 5
+; NO-SIMD128-NEXT:    i32.shl $push21=, $11, $pop44
+; NO-SIMD128-NEXT:    i32.store8 0($pop23), $pop21
+; NO-SIMD128-NEXT:    i32.const $push25=, 9
+; NO-SIMD128-NEXT:    i32.add $push26=, $0, $pop25
+; NO-SIMD128-NEXT:    i32.const $push43=, 5
+; NO-SIMD128-NEXT:    i32.shl $push24=, $10, $pop43
+; NO-SIMD128-NEXT:    i32.store8 0($pop26), $pop24
+; NO-SIMD128-NEXT:    i32.const $push28=, 7
+; NO-SIMD128-NEXT:    i32.add $push29=, $0, $pop28
+; NO-SIMD128-NEXT:    i32.const $push42=, 5
+; NO-SIMD128-NEXT:    i32.shl $push27=, $8, $pop42
+; NO-SIMD128-NEXT:    i32.store8 0($pop29), $pop27
+; NO-SIMD128-NEXT:    i32.const $push31=, 6
+; NO-SIMD128-NEXT:    i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT:    i32.const $push41=, 5
+; NO-SIMD128-NEXT:    i32.shl $push30=, $7, $pop41
+; NO-SIMD128-NEXT:    i32.store8 0($pop32), $pop30
+; NO-SIMD128-NEXT:    i32.const $push40=, 5
+; NO-SIMD128-NEXT:    i32.add $push34=, $0, $pop40
+; NO-SIMD128-NEXT:    i32.const $push39=, 5
+; NO-SIMD128-NEXT:    i32.shl $push33=, $6, $pop39
+; NO-SIMD128-NEXT:    i32.store8 0($pop34), $pop33
+; NO-SIMD128-NEXT:    i32.const $push36=, 3
+; NO-SIMD128-NEXT:    i32.add $push37=, $0, $pop36
+; NO-SIMD128-NEXT:    i32.const $push38=, 5
+; NO-SIMD128-NEXT:    i32.shl $push35=, $4, $pop38
+; NO-SIMD128-NEXT:    i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shl_const_v16i8:
+; NO-SIMD128-FAST:         .functype shl_const_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.const $push53=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push2=, $2, $pop53
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push52=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push3=, $3, $pop52
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push51=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push6=, $4, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push50=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push7=, $5, $pop50
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop7
+; NO-SIMD128-FAST-NEXT:    i32.const $push49=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop49
+; NO-SIMD128-FAST-NEXT:    i32.const $push48=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push9=, $6, $pop48
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop8), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push10=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push12=, $7, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop11), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push46=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push15=, $8, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT:    i32.const $push45=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push16=, $9, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push44=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push19=, $10, $pop44
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop18), $pop19
+; NO-SIMD128-FAST-NEXT:    i32.const $push20=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push22=, $11, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop21), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push23=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.const $push42=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push25=, $12, $pop42
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop24), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push41=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push28=, $13, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop27), $pop28
+; NO-SIMD128-FAST-NEXT:    i32.const $push29=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.const $push40=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push31=, $14, $pop40
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop30), $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push32=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push33=, $0, $pop32
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push34=, $15, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop33), $pop34
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push36=, $0, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.const $push38=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push37=, $16, $pop38
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT:    return
   %a = shl <16 x i8> %v,
     <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5,
      i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: shl_vec_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shl_vec_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: i8x16.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
-; SIMD128-NEXT: i32.const $push[[M0:[0-9]+]]=, 7{{$}}
-; SIMD128-NEXT: i32.and $push[[M1:[0-9]+]]=, $pop[[L1]], $pop[[M0]]{{$}}
-; SIMD128-NEXT: i32.shl $push[[M2:[0-9]+]]=, $pop[[L0]], $pop[[M1]]
-; SIMD128-NEXT: i8x16.splat $push[[M3:[0-9]+]]=, $pop[[M2]]
-; Skip 14 lanes
-; SIMD128:      i8x16.extract_lane_u $push[[L4:[0-9]+]]=, $0, 15{{$}}
-; SIMD128-NEXT: i8x16.extract_lane_u $push[[L5:[0-9]+]]=, $1, 15{{$}}
-; SIMD128-NEXT: i32.const $push[[M4:[0-9]+]]=, 7{{$}}
-; SIMD128-NEXT: i32.and $push[[M5:[0-9]+]]=, $pop[[L5]], $pop[[M4]]{{$}}
-; SIMD128-NEXT: i32.shl $push[[M6:[0-9]+]]=, $pop[[L4]], $pop[[M5]]{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[R:[0-9]+]]=, $pop[[M7:[0-9]+]], 15, $pop[[M6]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @shl_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
+; SIMD128-LABEL: shl_vec_v16i8:
+; SIMD128:         .functype shl_vec_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.extract_lane_u $push7=, $0, 0
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $1, 0
+; SIMD128-NEXT:    i32.const $push1=, 7
+; SIMD128-NEXT:    i32.and $push6=, $pop5, $pop1
+; SIMD128-NEXT:    i32.shl $push8=, $pop7, $pop6
+; SIMD128-NEXT:    i8x16.splat $push9=, $pop8
+; SIMD128-NEXT:    i8x16.extract_lane_u $push3=, $0, 1
+; SIMD128-NEXT:    i8x16.extract_lane_u $push0=, $1, 1
+; SIMD128-NEXT:    i32.const $push95=, 7
+; SIMD128-NEXT:    i32.and $push2=, $pop0, $pop95
+; SIMD128-NEXT:    i32.shl $push4=, $pop3, $pop2
+; SIMD128-NEXT:    i8x16.replace_lane $push10=, $pop9, 1, $pop4
+; SIMD128-NEXT:    i8x16.extract_lane_u $push13=, $0, 2
+; SIMD128-NEXT:    i8x16.extract_lane_u $push11=, $1, 2
+; SIMD128-NEXT:    i32.const $push94=, 7
+; SIMD128-NEXT:    i32.and $push12=, $pop11, $pop94
+; SIMD128-NEXT:    i32.shl $push14=, $pop13, $pop12
+; SIMD128-NEXT:    i8x16.replace_lane $push15=, $pop10, 2, $pop14
+; SIMD128-NEXT:    i8x16.extract_lane_u $push18=, $0, 3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push16=, $1, 3
+; SIMD128-NEXT:    i32.const $push93=, 7
+; SIMD128-NEXT:    i32.and $push17=, $pop16, $pop93
+; SIMD128-NEXT:    i32.shl $push19=, $pop18, $pop17
+; SIMD128-NEXT:    i8x16.replace_lane $push20=, $pop15, 3, $pop19
+; SIMD128-NEXT:    i8x16.extract_lane_u $push23=, $0, 4
+; SIMD128-NEXT:    i8x16.extract_lane_u $push21=, $1, 4
+; SIMD128-NEXT:    i32.const $push92=, 7
+; SIMD128-NEXT:    i32.and $push22=, $pop21, $pop92
+; SIMD128-NEXT:    i32.shl $push24=, $pop23, $pop22
+; SIMD128-NEXT:    i8x16.replace_lane $push25=, $pop20, 4, $pop24
+; SIMD128-NEXT:    i8x16.extract_lane_u $push28=, $0, 5
+; SIMD128-NEXT:    i8x16.extract_lane_u $push26=, $1, 5
+; SIMD128-NEXT:    i32.const $push91=, 7
+; SIMD128-NEXT:    i32.and $push27=, $pop26, $pop91
+; SIMD128-NEXT:    i32.shl $push29=, $pop28, $pop27
+; SIMD128-NEXT:    i8x16.replace_lane $push30=, $pop25, 5, $pop29
+; SIMD128-NEXT:    i8x16.extract_lane_u $push33=, $0, 6
+; SIMD128-NEXT:    i8x16.extract_lane_u $push31=, $1, 6
+; SIMD128-NEXT:    i32.const $push90=, 7
+; SIMD128-NEXT:    i32.and $push32=, $pop31, $pop90
+; SIMD128-NEXT:    i32.shl $push34=, $pop33, $pop32
+; SIMD128-NEXT:    i8x16.replace_lane $push35=, $pop30, 6, $pop34
+; SIMD128-NEXT:    i8x16.extract_lane_u $push38=, $0, 7
+; SIMD128-NEXT:    i8x16.extract_lane_u $push36=, $1, 7
+; SIMD128-NEXT:    i32.const $push89=, 7
+; SIMD128-NEXT:    i32.and $push37=, $pop36, $pop89
+; SIMD128-NEXT:    i32.shl $push39=, $pop38, $pop37
+; SIMD128-NEXT:    i8x16.replace_lane $push40=, $pop35, 7, $pop39
+; SIMD128-NEXT:    i8x16.extract_lane_u $push43=, $0, 8
+; SIMD128-NEXT:    i8x16.extract_lane_u $push41=, $1, 8
+; SIMD128-NEXT:    i32.const $push88=, 7
+; SIMD128-NEXT:    i32.and $push42=, $pop41, $pop88
+; SIMD128-NEXT:    i32.shl $push44=, $pop43, $pop42
+; SIMD128-NEXT:    i8x16.replace_lane $push45=, $pop40, 8, $pop44
+; SIMD128-NEXT:    i8x16.extract_lane_u $push48=, $0, 9
+; SIMD128-NEXT:    i8x16.extract_lane_u $push46=, $1, 9
+; SIMD128-NEXT:    i32.const $push87=, 7
+; SIMD128-NEXT:    i32.and $push47=, $pop46, $pop87
+; SIMD128-NEXT:    i32.shl $push49=, $pop48, $pop47
+; SIMD128-NEXT:    i8x16.replace_lane $push50=, $pop45, 9, $pop49
+; SIMD128-NEXT:    i8x16.extract_lane_u $push53=, $0, 10
+; SIMD128-NEXT:    i8x16.extract_lane_u $push51=, $1, 10
+; SIMD128-NEXT:    i32.const $push86=, 7
+; SIMD128-NEXT:    i32.and $push52=, $pop51, $pop86
+; SIMD128-NEXT:    i32.shl $push54=, $pop53, $pop52
+; SIMD128-NEXT:    i8x16.replace_lane $push55=, $pop50, 10, $pop54
+; SIMD128-NEXT:    i8x16.extract_lane_u $push58=, $0, 11
+; SIMD128-NEXT:    i8x16.extract_lane_u $push56=, $1, 11
+; SIMD128-NEXT:    i32.const $push85=, 7
+; SIMD128-NEXT:    i32.and $push57=, $pop56, $pop85
+; SIMD128-NEXT:    i32.shl $push59=, $pop58, $pop57
+; SIMD128-NEXT:    i8x16.replace_lane $push60=, $pop55, 11, $pop59
+; SIMD128-NEXT:    i8x16.extract_lane_u $push63=, $0, 12
+; SIMD128-NEXT:    i8x16.extract_lane_u $push61=, $1, 12
+; SIMD128-NEXT:    i32.const $push84=, 7
+; SIMD128-NEXT:    i32.and $push62=, $pop61, $pop84
+; SIMD128-NEXT:    i32.shl $push64=, $pop63, $pop62
+; SIMD128-NEXT:    i8x16.replace_lane $push65=, $pop60, 12, $pop64
+; SIMD128-NEXT:    i8x16.extract_lane_u $push68=, $0, 13
+; SIMD128-NEXT:    i8x16.extract_lane_u $push66=, $1, 13
+; SIMD128-NEXT:    i32.const $push83=, 7
+; SIMD128-NEXT:    i32.and $push67=, $pop66, $pop83
+; SIMD128-NEXT:    i32.shl $push69=, $pop68, $pop67
+; SIMD128-NEXT:    i8x16.replace_lane $push70=, $pop65, 13, $pop69
+; SIMD128-NEXT:    i8x16.extract_lane_u $push73=, $0, 14
+; SIMD128-NEXT:    i8x16.extract_lane_u $push71=, $1, 14
+; SIMD128-NEXT:    i32.const $push82=, 7
+; SIMD128-NEXT:    i32.and $push72=, $pop71, $pop82
+; SIMD128-NEXT:    i32.shl $push74=, $pop73, $pop72
+; SIMD128-NEXT:    i8x16.replace_lane $push75=, $pop70, 14, $pop74
+; SIMD128-NEXT:    i8x16.extract_lane_u $push78=, $0, 15
+; SIMD128-NEXT:    i8x16.extract_lane_u $push76=, $1, 15
+; SIMD128-NEXT:    i32.const $push81=, 7
+; SIMD128-NEXT:    i32.and $push77=, $pop76, $pop81
+; SIMD128-NEXT:    i32.shl $push79=, $pop78, $pop77
+; SIMD128-NEXT:    i8x16.replace_lane $push80=, $pop75, 15, $pop79
+; SIMD128-NEXT:    return $pop80
+;
+; SIMD128-FAST-LABEL: shl_vec_v16i8:
+; SIMD128-FAST:         .functype shl_vec_v16i8 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push8=, $0, 0
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push6=, $1, 0
+; SIMD128-FAST-NEXT:    i32.const $push2=, 7
+; SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $pop2
+; SIMD128-FAST-NEXT:    i32.shl $push9=, $pop8, $pop7
+; SIMD128-FAST-NEXT:    i8x16.splat $push10=, $pop9
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push4=, $0, 1
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push1=, $1, 1
+; SIMD128-FAST-NEXT:    i32.const $push95=, 7
+; SIMD128-FAST-NEXT:    i32.and $push3=, $pop1, $pop95
+; SIMD128-FAST-NEXT:    i32.shl $push5=, $pop4, $pop3
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push11=, $pop10, 1, $pop5
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push14=, $0, 2
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push12=, $1, 2
+; SIMD128-FAST-NEXT:    i32.const $push94=, 7
+; SIMD128-FAST-NEXT:    i32.and $push13=, $pop12, $pop94
+; SIMD128-FAST-NEXT:    i32.shl $push15=, $pop14, $pop13
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push16=, $pop11, 2, $pop15
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push19=, $0, 3
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push17=, $1, 3
+; SIMD128-FAST-NEXT:    i32.const $push93=, 7
+; SIMD128-FAST-NEXT:    i32.and $push18=, $pop17, $pop93
+; SIMD128-FAST-NEXT:    i32.shl $push20=, $pop19, $pop18
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push21=, $pop16, 3, $pop20
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push24=, $0, 4
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push22=, $1, 4
+; SIMD128-FAST-NEXT:    i32.const $push92=, 7
+; SIMD128-FAST-NEXT:    i32.and $push23=, $pop22, $pop92
+; SIMD128-FAST-NEXT:    i32.shl $push25=, $pop24, $pop23
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push26=, $pop21, 4, $pop25
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push29=, $0, 5
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push27=, $1, 5
+; SIMD128-FAST-NEXT:    i32.const $push91=, 7
+; SIMD128-FAST-NEXT:    i32.and $push28=, $pop27, $pop91
+; SIMD128-FAST-NEXT:    i32.shl $push30=, $pop29, $pop28
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push31=, $pop26, 5, $pop30
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push34=, $0, 6
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push32=, $1, 6
+; SIMD128-FAST-NEXT:    i32.const $push90=, 7
+; SIMD128-FAST-NEXT:    i32.and $push33=, $pop32, $pop90
+; SIMD128-FAST-NEXT:    i32.shl $push35=, $pop34, $pop33
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push36=, $pop31, 6, $pop35
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push39=, $0, 7
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push37=, $1, 7
+; SIMD128-FAST-NEXT:    i32.const $push89=, 7
+; SIMD128-FAST-NEXT:    i32.and $push38=, $pop37, $pop89
+; SIMD128-FAST-NEXT:    i32.shl $push40=, $pop39, $pop38
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push41=, $pop36, 7, $pop40
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push44=, $0, 8
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push42=, $1, 8
+; SIMD128-FAST-NEXT:    i32.const $push88=, 7
+; SIMD128-FAST-NEXT:    i32.and $push43=, $pop42, $pop88
+; SIMD128-FAST-NEXT:    i32.shl $push45=, $pop44, $pop43
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push46=, $pop41, 8, $pop45
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push49=, $0, 9
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push47=, $1, 9
+; SIMD128-FAST-NEXT:    i32.const $push87=, 7
+; SIMD128-FAST-NEXT:    i32.and $push48=, $pop47, $pop87
+; SIMD128-FAST-NEXT:    i32.shl $push50=, $pop49, $pop48
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push51=, $pop46, 9, $pop50
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push54=, $0, 10
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push52=, $1, 10
+; SIMD128-FAST-NEXT:    i32.const $push86=, 7
+; SIMD128-FAST-NEXT:    i32.and $push53=, $pop52, $pop86
+; SIMD128-FAST-NEXT:    i32.shl $push55=, $pop54, $pop53
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push56=, $pop51, 10, $pop55
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push59=, $0, 11
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push57=, $1, 11
+; SIMD128-FAST-NEXT:    i32.const $push85=, 7
+; SIMD128-FAST-NEXT:    i32.and $push58=, $pop57, $pop85
+; SIMD128-FAST-NEXT:    i32.shl $push60=, $pop59, $pop58
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push61=, $pop56, 11, $pop60
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push64=, $0, 12
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push62=, $1, 12
+; SIMD128-FAST-NEXT:    i32.const $push84=, 7
+; SIMD128-FAST-NEXT:    i32.and $push63=, $pop62, $pop84
+; SIMD128-FAST-NEXT:    i32.shl $push65=, $pop64, $pop63
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push66=, $pop61, 12, $pop65
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push69=, $0, 13
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push67=, $1, 13
+; SIMD128-FAST-NEXT:    i32.const $push83=, 7
+; SIMD128-FAST-NEXT:    i32.and $push68=, $pop67, $pop83
+; SIMD128-FAST-NEXT:    i32.shl $push70=, $pop69, $pop68
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push71=, $pop66, 13, $pop70
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push74=, $0, 14
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push72=, $1, 14
+; SIMD128-FAST-NEXT:    i32.const $push82=, 7
+; SIMD128-FAST-NEXT:    i32.and $push73=, $pop72, $pop82
+; SIMD128-FAST-NEXT:    i32.shl $push75=, $pop74, $pop73
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push76=, $pop71, 14, $pop75
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push79=, $0, 15
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push77=, $1, 15
+; SIMD128-FAST-NEXT:    i32.const $push81=, 7
+; SIMD128-FAST-NEXT:    i32.and $push78=, $pop77, $pop81
+; SIMD128-FAST-NEXT:    i32.shl $push80=, $pop79, $pop78
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push0=, $pop76, 15, $pop80
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shl_vec_v16i8:
+; NO-SIMD128:         .functype shl_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-NEXT:    i32.and $push1=, $25, $pop0
+; NO-SIMD128-NEXT:    i32.shl $push2=, $9, $pop1
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push69=, 255
+; NO-SIMD128-NEXT:    i32.and $push3=, $21, $pop69
+; NO-SIMD128-NEXT:    i32.shl $push4=, $5, $pop3
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push68=, 255
+; NO-SIMD128-NEXT:    i32.and $push5=, $19, $pop68
+; NO-SIMD128-NEXT:    i32.shl $push6=, $3, $pop5
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop6
+; NO-SIMD128-NEXT:    i32.const $push67=, 255
+; NO-SIMD128-NEXT:    i32.and $push7=, $18, $pop67
+; NO-SIMD128-NEXT:    i32.shl $push8=, $2, $pop7
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop8
+; NO-SIMD128-NEXT:    i32.const $push66=, 255
+; NO-SIMD128-NEXT:    i32.and $push9=, $17, $pop66
+; NO-SIMD128-NEXT:    i32.shl $push10=, $1, $pop9
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop10
+; NO-SIMD128-NEXT:    i32.const $push13=, 15
+; NO-SIMD128-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT:    i32.const $push65=, 255
+; NO-SIMD128-NEXT:    i32.and $push11=, $32, $pop65
+; NO-SIMD128-NEXT:    i32.shl $push12=, $16, $pop11
+; NO-SIMD128-NEXT:    i32.store8 0($pop14), $pop12
+; NO-SIMD128-NEXT:    i32.const $push17=, 14
+; NO-SIMD128-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-NEXT:    i32.const $push64=, 255
+; NO-SIMD128-NEXT:    i32.and $push15=, $31, $pop64
+; NO-SIMD128-NEXT:    i32.shl $push16=, $15, $pop15
+; NO-SIMD128-NEXT:    i32.store8 0($pop18), $pop16
+; NO-SIMD128-NEXT:    i32.const $push21=, 13
+; NO-SIMD128-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT:    i32.const $push63=, 255
+; NO-SIMD128-NEXT:    i32.and $push19=, $30, $pop63
+; NO-SIMD128-NEXT:    i32.shl $push20=, $14, $pop19
+; NO-SIMD128-NEXT:    i32.store8 0($pop22), $pop20
+; NO-SIMD128-NEXT:    i32.const $push25=, 12
+; NO-SIMD128-NEXT:    i32.add $push26=, $0, $pop25
+; NO-SIMD128-NEXT:    i32.const $push62=, 255
+; NO-SIMD128-NEXT:    i32.and $push23=, $29, $pop62
+; NO-SIMD128-NEXT:    i32.shl $push24=, $13, $pop23
+; NO-SIMD128-NEXT:    i32.store8 0($pop26), $pop24
+; NO-SIMD128-NEXT:    i32.const $push29=, 11
+; NO-SIMD128-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-NEXT:    i32.const $push61=, 255
+; NO-SIMD128-NEXT:    i32.and $push27=, $28, $pop61
+; NO-SIMD128-NEXT:    i32.shl $push28=, $12, $pop27
+; NO-SIMD128-NEXT:    i32.store8 0($pop30), $pop28
+; NO-SIMD128-NEXT:    i32.const $push33=, 10
+; NO-SIMD128-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-NEXT:    i32.const $push60=, 255
+; NO-SIMD128-NEXT:    i32.and $push31=, $27, $pop60
+; NO-SIMD128-NEXT:    i32.shl $push32=, $11, $pop31
+; NO-SIMD128-NEXT:    i32.store8 0($pop34), $pop32
+; NO-SIMD128-NEXT:    i32.const $push37=, 9
+; NO-SIMD128-NEXT:    i32.add $push38=, $0, $pop37
+; NO-SIMD128-NEXT:    i32.const $push59=, 255
+; NO-SIMD128-NEXT:    i32.and $push35=, $26, $pop59
+; NO-SIMD128-NEXT:    i32.shl $push36=, $10, $pop35
+; NO-SIMD128-NEXT:    i32.store8 0($pop38), $pop36
+; NO-SIMD128-NEXT:    i32.const $push41=, 7
+; NO-SIMD128-NEXT:    i32.add $push42=, $0, $pop41
+; NO-SIMD128-NEXT:    i32.const $push58=, 255
+; NO-SIMD128-NEXT:    i32.and $push39=, $24, $pop58
+; NO-SIMD128-NEXT:    i32.shl $push40=, $8, $pop39
+; NO-SIMD128-NEXT:    i32.store8 0($pop42), $pop40
+; NO-SIMD128-NEXT:    i32.const $push45=, 6
+; NO-SIMD128-NEXT:    i32.add $push46=, $0, $pop45
+; NO-SIMD128-NEXT:    i32.const $push57=, 255
+; NO-SIMD128-NEXT:    i32.and $push43=, $23, $pop57
+; NO-SIMD128-NEXT:    i32.shl $push44=, $7, $pop43
+; NO-SIMD128-NEXT:    i32.store8 0($pop46), $pop44
+; NO-SIMD128-NEXT:    i32.const $push49=, 5
+; NO-SIMD128-NEXT:    i32.add $push50=, $0, $pop49
+; NO-SIMD128-NEXT:    i32.const $push56=, 255
+; NO-SIMD128-NEXT:    i32.and $push47=, $22, $pop56
+; NO-SIMD128-NEXT:    i32.shl $push48=, $6, $pop47
+; NO-SIMD128-NEXT:    i32.store8 0($pop50), $pop48
+; NO-SIMD128-NEXT:    i32.const $push53=, 3
+; NO-SIMD128-NEXT:    i32.add $push54=, $0, $pop53
+; NO-SIMD128-NEXT:    i32.const $push55=, 255
+; NO-SIMD128-NEXT:    i32.and $push51=, $20, $pop55
+; NO-SIMD128-NEXT:    i32.shl $push52=, $4, $pop51
+; NO-SIMD128-NEXT:    i32.store8 0($pop54), $pop52
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shl_vec_v16i8:
+; NO-SIMD128-FAST:         .functype shl_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $17, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.shl $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push69=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $18, $pop69
+; NO-SIMD128-FAST-NEXT:    i32.shl $push4=, $2, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push68=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $19, $pop68
+; NO-SIMD128-FAST-NEXT:    i32.shl $push6=, $3, $pop5
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.const $push67=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $20, $pop67
+; NO-SIMD128-FAST-NEXT:    i32.shl $push10=, $4, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop8), $pop10
+; NO-SIMD128-FAST-NEXT:    i32.const $push66=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $21, $pop66
+; NO-SIMD128-FAST-NEXT:    i32.shl $push12=, $5, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push65=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $22, $pop65
+; NO-SIMD128-FAST-NEXT:    i32.shl $push16=, $6, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop14), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push64=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $23, $pop64
+; NO-SIMD128-FAST-NEXT:    i32.shl $push20=, $7, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop18), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push63=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $24, $pop63
+; NO-SIMD128-FAST-NEXT:    i32.shl $push24=, $8, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop22), $pop24
+; NO-SIMD128-FAST-NEXT:    i32.const $push62=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $25, $pop62
+; NO-SIMD128-FAST-NEXT:    i32.shl $push26=, $9, $pop25
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.const $push61=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $26, $pop61
+; NO-SIMD128-FAST-NEXT:    i32.shl $push30=, $10, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop28), $pop30
+; NO-SIMD128-FAST-NEXT:    i32.const $push31=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push60=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push33=, $27, $pop60
+; NO-SIMD128-FAST-NEXT:    i32.shl $push34=, $11, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop32), $pop34
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push36=, $0, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.const $push59=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $28, $pop59
+; NO-SIMD128-FAST-NEXT:    i32.shl $push38=, $12, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop36), $pop38
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push40=, $0, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.const $push58=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push41=, $29, $pop58
+; NO-SIMD128-FAST-NEXT:    i32.shl $push42=, $13, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop40), $pop42
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push44=, $0, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.const $push57=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push45=, $30, $pop57
+; NO-SIMD128-FAST-NEXT:    i32.shl $push46=, $14, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop44), $pop46
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push48=, $0, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.const $push56=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push49=, $31, $pop56
+; NO-SIMD128-FAST-NEXT:    i32.shl $push50=, $15, $pop49
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop48), $pop50
+; NO-SIMD128-FAST-NEXT:    i32.const $push51=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push52=, $0, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.const $push55=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push53=, $32, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.shl $push54=, $16, $pop53
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop52), $pop54
+; NO-SIMD128-FAST-NEXT:    return
   %a = shl <16 x i8> %v, %x
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: shr_s_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shr_s_v16i8 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shr_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) {
+; SIMD128-LABEL: shr_s_v16i8:
+; SIMD128:         .functype shr_s_v16i8 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shr_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: shr_s_v16i8:
+; SIMD128-FAST:         .functype shr_s_v16i8 (v128, i32) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.shr_s $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_s_v16i8:
+; NO-SIMD128:         .functype shr_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.extend8_s $push1=, $9
+; NO-SIMD128-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-NEXT:    i32.and $push56=, $17, $pop0
+; NO-SIMD128-NEXT:    local.tee $push55=, $17=, $pop56
+; NO-SIMD128-NEXT:    i32.shr_s $push2=, $pop1, $pop55
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop2
+; NO-SIMD128-NEXT:    i32.extend8_s $push3=, $5
+; NO-SIMD128-NEXT:    i32.shr_s $push4=, $pop3, $17
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop4
+; NO-SIMD128-NEXT:    i32.extend8_s $push5=, $3
+; NO-SIMD128-NEXT:    i32.shr_s $push6=, $pop5, $17
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop6
+; NO-SIMD128-NEXT:    i32.extend8_s $push7=, $2
+; NO-SIMD128-NEXT:    i32.shr_s $push8=, $pop7, $17
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop8
+; NO-SIMD128-NEXT:    i32.extend8_s $push9=, $1
+; NO-SIMD128-NEXT:    i32.shr_s $push10=, $pop9, $17
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop10
+; NO-SIMD128-NEXT:    i32.const $push13=, 15
+; NO-SIMD128-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT:    i32.extend8_s $push11=, $16
+; NO-SIMD128-NEXT:    i32.shr_s $push12=, $pop11, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop14), $pop12
+; NO-SIMD128-NEXT:    i32.const $push17=, 14
+; NO-SIMD128-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-NEXT:    i32.extend8_s $push15=, $15
+; NO-SIMD128-NEXT:    i32.shr_s $push16=, $pop15, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop18), $pop16
+; NO-SIMD128-NEXT:    i32.const $push21=, 13
+; NO-SIMD128-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT:    i32.extend8_s $push19=, $14
+; NO-SIMD128-NEXT:    i32.shr_s $push20=, $pop19, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop22), $pop20
+; NO-SIMD128-NEXT:    i32.const $push25=, 12
+; NO-SIMD128-NEXT:    i32.add $push26=, $0, $pop25
+; NO-SIMD128-NEXT:    i32.extend8_s $push23=, $13
+; NO-SIMD128-NEXT:    i32.shr_s $push24=, $pop23, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop26), $pop24
+; NO-SIMD128-NEXT:    i32.const $push29=, 11
+; NO-SIMD128-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-NEXT:    i32.extend8_s $push27=, $12
+; NO-SIMD128-NEXT:    i32.shr_s $push28=, $pop27, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop30), $pop28
+; NO-SIMD128-NEXT:    i32.const $push33=, 10
+; NO-SIMD128-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-NEXT:    i32.extend8_s $push31=, $11
+; NO-SIMD128-NEXT:    i32.shr_s $push32=, $pop31, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop34), $pop32
+; NO-SIMD128-NEXT:    i32.const $push37=, 9
+; NO-SIMD128-NEXT:    i32.add $push38=, $0, $pop37
+; NO-SIMD128-NEXT:    i32.extend8_s $push35=, $10
+; NO-SIMD128-NEXT:    i32.shr_s $push36=, $pop35, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop38), $pop36
+; NO-SIMD128-NEXT:    i32.const $push41=, 7
+; NO-SIMD128-NEXT:    i32.add $push42=, $0, $pop41
+; NO-SIMD128-NEXT:    i32.extend8_s $push39=, $8
+; NO-SIMD128-NEXT:    i32.shr_s $push40=, $pop39, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop42), $pop40
+; NO-SIMD128-NEXT:    i32.const $push45=, 6
+; NO-SIMD128-NEXT:    i32.add $push46=, $0, $pop45
+; NO-SIMD128-NEXT:    i32.extend8_s $push43=, $7
+; NO-SIMD128-NEXT:    i32.shr_s $push44=, $pop43, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop46), $pop44
+; NO-SIMD128-NEXT:    i32.const $push49=, 5
+; NO-SIMD128-NEXT:    i32.add $push50=, $0, $pop49
+; NO-SIMD128-NEXT:    i32.extend8_s $push47=, $6
+; NO-SIMD128-NEXT:    i32.shr_s $push48=, $pop47, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop50), $pop48
+; NO-SIMD128-NEXT:    i32.const $push53=, 3
+; NO-SIMD128-NEXT:    i32.add $push54=, $0, $pop53
+; NO-SIMD128-NEXT:    i32.extend8_s $push51=, $4
+; NO-SIMD128-NEXT:    i32.shr_s $push52=, $pop51, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop54), $pop52
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_s_v16i8:
+; NO-SIMD128-FAST:         .functype shr_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push1=, $1
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push56=, $17, $pop0
+; NO-SIMD128-FAST-NEXT:    local.tee $push55=, $1=, $pop56
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push2=, $pop1, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push3=, $2
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push4=, $pop3, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push5=, $3
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push6=, $pop5, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push9=, $4
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push10=, $pop9, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop8), $pop10
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push11=, $5
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push12=, $pop11, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push15=, $6
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push16=, $pop15, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop14), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push19=, $7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push20=, $pop19, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop18), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push23=, $8
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push24=, $pop23, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop22), $pop24
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push25=, $9
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push26=, $pop25, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push29=, $10
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push30=, $pop29, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop28), $pop30
+; NO-SIMD128-FAST-NEXT:    i32.const $push31=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push33=, $11
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push34=, $pop33, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop32), $pop34
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push36=, $0, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push37=, $12
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push38=, $pop37, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop36), $pop38
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push40=, $0, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push41=, $13
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push42=, $pop41, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop40), $pop42
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push44=, $0, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push45=, $14
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push46=, $pop45, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop44), $pop46
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push48=, $0, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push49=, $15
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push50=, $pop49, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop48), $pop50
+; NO-SIMD128-FAST-NEXT:    i32.const $push51=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push52=, $0, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push53=, $16
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push54=, $pop53, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop52), $pop54
+; NO-SIMD128-FAST-NEXT:    return
   %t = insertelement <16 x i8> undef, i8 %x, i32 0
   %s = shufflevector <16 x i8> %t, <16 x i8> undef,
     <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
@@ -206,34 +3605,626 @@ define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) {
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: shr_s_vec_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shr_s_vec_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.extract_lane_s $push[[L0:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: i8x16.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
-; SIMD128-NEXT: i32.const $push[[M0:[0-9]+]]=, 7{{$}}
-; SIMD128-NEXT: i32.and $push[[M1:[0-9]+]]=, $pop[[L1]], $pop[[M0]]{{$}}
-; SIMD128-NEXT: i32.shr_s $push[[M2:[0-9]+]]=, $pop[[L0]], $pop[[M1]]
-; SIMD128-NEXT: i8x16.splat $push[[M3:[0-9]+]]=, $pop[[M2]]
-; Skip 14 lanes
-; SIMD128:      i8x16.extract_lane_s $push[[L4:[0-9]+]]=, $0, 15{{$}}
-; SIMD128-NEXT: i8x16.extract_lane_u $push[[L5:[0-9]+]]=, $1, 15{{$}}
-; SIMD128-NEXT: i32.const $push[[M4:[0-9]+]]=, 7{{$}}
-; SIMD128-NEXT: i32.and $push[[M5:[0-9]+]]=, $pop[[L5]], $pop[[M4]]{{$}}
-; SIMD128-NEXT: i32.shr_s $push[[M6:[0-9]+]]=, $pop[[L4]], $pop[[M5]]{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[R:[0-9]+]]=, $pop[[M7:[0-9]+]], 15, $pop[[M6]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @shr_s_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
+; SIMD128-LABEL: shr_s_vec_v16i8:
+; SIMD128:         .functype shr_s_vec_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.extract_lane_s $push7=, $0, 0
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $1, 0
+; SIMD128-NEXT:    i32.const $push1=, 7
+; SIMD128-NEXT:    i32.and $push6=, $pop5, $pop1
+; SIMD128-NEXT:    i32.shr_s $push8=, $pop7, $pop6
+; SIMD128-NEXT:    i8x16.splat $push9=, $pop8
+; SIMD128-NEXT:    i8x16.extract_lane_s $push3=, $0, 1
+; SIMD128-NEXT:    i8x16.extract_lane_u $push0=, $1, 1
+; SIMD128-NEXT:    i32.const $push95=, 7
+; SIMD128-NEXT:    i32.and $push2=, $pop0, $pop95
+; SIMD128-NEXT:    i32.shr_s $push4=, $pop3, $pop2
+; SIMD128-NEXT:    i8x16.replace_lane $push10=, $pop9, 1, $pop4
+; SIMD128-NEXT:    i8x16.extract_lane_s $push13=, $0, 2
+; SIMD128-NEXT:    i8x16.extract_lane_u $push11=, $1, 2
+; SIMD128-NEXT:    i32.const $push94=, 7
+; SIMD128-NEXT:    i32.and $push12=, $pop11, $pop94
+; SIMD128-NEXT:    i32.shr_s $push14=, $pop13, $pop12
+; SIMD128-NEXT:    i8x16.replace_lane $push15=, $pop10, 2, $pop14
+; SIMD128-NEXT:    i8x16.extract_lane_s $push18=, $0, 3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push16=, $1, 3
+; SIMD128-NEXT:    i32.const $push93=, 7
+; SIMD128-NEXT:    i32.and $push17=, $pop16, $pop93
+; SIMD128-NEXT:    i32.shr_s $push19=, $pop18, $pop17
+; SIMD128-NEXT:    i8x16.replace_lane $push20=, $pop15, 3, $pop19
+; SIMD128-NEXT:    i8x16.extract_lane_s $push23=, $0, 4
+; SIMD128-NEXT:    i8x16.extract_lane_u $push21=, $1, 4
+; SIMD128-NEXT:    i32.const $push92=, 7
+; SIMD128-NEXT:    i32.and $push22=, $pop21, $pop92
+; SIMD128-NEXT:    i32.shr_s $push24=, $pop23, $pop22
+; SIMD128-NEXT:    i8x16.replace_lane $push25=, $pop20, 4, $pop24
+; SIMD128-NEXT:    i8x16.extract_lane_s $push28=, $0, 5
+; SIMD128-NEXT:    i8x16.extract_lane_u $push26=, $1, 5
+; SIMD128-NEXT:    i32.const $push91=, 7
+; SIMD128-NEXT:    i32.and $push27=, $pop26, $pop91
+; SIMD128-NEXT:    i32.shr_s $push29=, $pop28, $pop27
+; SIMD128-NEXT:    i8x16.replace_lane $push30=, $pop25, 5, $pop29
+; SIMD128-NEXT:    i8x16.extract_lane_s $push33=, $0, 6
+; SIMD128-NEXT:    i8x16.extract_lane_u $push31=, $1, 6
+; SIMD128-NEXT:    i32.const $push90=, 7
+; SIMD128-NEXT:    i32.and $push32=, $pop31, $pop90
+; SIMD128-NEXT:    i32.shr_s $push34=, $pop33, $pop32
+; SIMD128-NEXT:    i8x16.replace_lane $push35=, $pop30, 6, $pop34
+; SIMD128-NEXT:    i8x16.extract_lane_s $push38=, $0, 7
+; SIMD128-NEXT:    i8x16.extract_lane_u $push36=, $1, 7
+; SIMD128-NEXT:    i32.const $push89=, 7
+; SIMD128-NEXT:    i32.and $push37=, $pop36, $pop89
+; SIMD128-NEXT:    i32.shr_s $push39=, $pop38, $pop37
+; SIMD128-NEXT:    i8x16.replace_lane $push40=, $pop35, 7, $pop39
+; SIMD128-NEXT:    i8x16.extract_lane_s $push43=, $0, 8
+; SIMD128-NEXT:    i8x16.extract_lane_u $push41=, $1, 8
+; SIMD128-NEXT:    i32.const $push88=, 7
+; SIMD128-NEXT:    i32.and $push42=, $pop41, $pop88
+; SIMD128-NEXT:    i32.shr_s $push44=, $pop43, $pop42
+; SIMD128-NEXT:    i8x16.replace_lane $push45=, $pop40, 8, $pop44
+; SIMD128-NEXT:    i8x16.extract_lane_s $push48=, $0, 9
+; SIMD128-NEXT:    i8x16.extract_lane_u $push46=, $1, 9
+; SIMD128-NEXT:    i32.const $push87=, 7
+; SIMD128-NEXT:    i32.and $push47=, $pop46, $pop87
+; SIMD128-NEXT:    i32.shr_s $push49=, $pop48, $pop47
+; SIMD128-NEXT:    i8x16.replace_lane $push50=, $pop45, 9, $pop49
+; SIMD128-NEXT:    i8x16.extract_lane_s $push53=, $0, 10
+; SIMD128-NEXT:    i8x16.extract_lane_u $push51=, $1, 10
+; SIMD128-NEXT:    i32.const $push86=, 7
+; SIMD128-NEXT:    i32.and $push52=, $pop51, $pop86
+; SIMD128-NEXT:    i32.shr_s $push54=, $pop53, $pop52
+; SIMD128-NEXT:    i8x16.replace_lane $push55=, $pop50, 10, $pop54
+; SIMD128-NEXT:    i8x16.extract_lane_s $push58=, $0, 11
+; SIMD128-NEXT:    i8x16.extract_lane_u $push56=, $1, 11
+; SIMD128-NEXT:    i32.const $push85=, 7
+; SIMD128-NEXT:    i32.and $push57=, $pop56, $pop85
+; SIMD128-NEXT:    i32.shr_s $push59=, $pop58, $pop57
+; SIMD128-NEXT:    i8x16.replace_lane $push60=, $pop55, 11, $pop59
+; SIMD128-NEXT:    i8x16.extract_lane_s $push63=, $0, 12
+; SIMD128-NEXT:    i8x16.extract_lane_u $push61=, $1, 12
+; SIMD128-NEXT:    i32.const $push84=, 7
+; SIMD128-NEXT:    i32.and $push62=, $pop61, $pop84
+; SIMD128-NEXT:    i32.shr_s $push64=, $pop63, $pop62
+; SIMD128-NEXT:    i8x16.replace_lane $push65=, $pop60, 12, $pop64
+; SIMD128-NEXT:    i8x16.extract_lane_s $push68=, $0, 13
+; SIMD128-NEXT:    i8x16.extract_lane_u $push66=, $1, 13
+; SIMD128-NEXT:    i32.const $push83=, 7
+; SIMD128-NEXT:    i32.and $push67=, $pop66, $pop83
+; SIMD128-NEXT:    i32.shr_s $push69=, $pop68, $pop67
+; SIMD128-NEXT:    i8x16.replace_lane $push70=, $pop65, 13, $pop69
+; SIMD128-NEXT:    i8x16.extract_lane_s $push73=, $0, 14
+; SIMD128-NEXT:    i8x16.extract_lane_u $push71=, $1, 14
+; SIMD128-NEXT:    i32.const $push82=, 7
+; SIMD128-NEXT:    i32.and $push72=, $pop71, $pop82
+; SIMD128-NEXT:    i32.shr_s $push74=, $pop73, $pop72
+; SIMD128-NEXT:    i8x16.replace_lane $push75=, $pop70, 14, $pop74
+; SIMD128-NEXT:    i8x16.extract_lane_s $push78=, $0, 15
+; SIMD128-NEXT:    i8x16.extract_lane_u $push76=, $1, 15
+; SIMD128-NEXT:    i32.const $push81=, 7
+; SIMD128-NEXT:    i32.and $push77=, $pop76, $pop81
+; SIMD128-NEXT:    i32.shr_s $push79=, $pop78, $pop77
+; SIMD128-NEXT:    i8x16.replace_lane $push80=, $pop75, 15, $pop79
+; SIMD128-NEXT:    return $pop80
+;
+; SIMD128-FAST-LABEL: shr_s_vec_v16i8:
+; SIMD128-FAST:         .functype shr_s_vec_v16i8 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_s $push8=, $0, 0
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push6=, $1, 0
+; SIMD128-FAST-NEXT:    i32.const $push2=, 7
+; SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $pop2
+; SIMD128-FAST-NEXT:    i32.shr_s $push9=, $pop8, $pop7
+; SIMD128-FAST-NEXT:    i8x16.splat $push10=, $pop9
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_s $push4=, $0, 1
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push1=, $1, 1
+; SIMD128-FAST-NEXT:    i32.const $push95=, 7
+; SIMD128-FAST-NEXT:    i32.and $push3=, $pop1, $pop95
+; SIMD128-FAST-NEXT:    i32.shr_s $push5=, $pop4, $pop3
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push11=, $pop10, 1, $pop5
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_s $push14=, $0, 2
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push12=, $1, 2
+; SIMD128-FAST-NEXT:    i32.const $push94=, 7
+; SIMD128-FAST-NEXT:    i32.and $push13=, $pop12, $pop94
+; SIMD128-FAST-NEXT:    i32.shr_s $push15=, $pop14, $pop13
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push16=, $pop11, 2, $pop15
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_s $push19=, $0, 3
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push17=, $1, 3
+; SIMD128-FAST-NEXT:    i32.const $push93=, 7
+; SIMD128-FAST-NEXT:    i32.and $push18=, $pop17, $pop93
+; SIMD128-FAST-NEXT:    i32.shr_s $push20=, $pop19, $pop18
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push21=, $pop16, 3, $pop20
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_s $push24=, $0, 4
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push22=, $1, 4
+; SIMD128-FAST-NEXT:    i32.const $push92=, 7
+; SIMD128-FAST-NEXT:    i32.and $push23=, $pop22, $pop92
+; SIMD128-FAST-NEXT:    i32.shr_s $push25=, $pop24, $pop23
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push26=, $pop21, 4, $pop25
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_s $push29=, $0, 5
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push27=, $1, 5
+; SIMD128-FAST-NEXT:    i32.const $push91=, 7
+; SIMD128-FAST-NEXT:    i32.and $push28=, $pop27, $pop91
+; SIMD128-FAST-NEXT:    i32.shr_s $push30=, $pop29, $pop28
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push31=, $pop26, 5, $pop30
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_s $push34=, $0, 6
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push32=, $1, 6
+; SIMD128-FAST-NEXT:    i32.const $push90=, 7
+; SIMD128-FAST-NEXT:    i32.and $push33=, $pop32, $pop90
+; SIMD128-FAST-NEXT:    i32.shr_s $push35=, $pop34, $pop33
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push36=, $pop31, 6, $pop35
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_s $push39=, $0, 7
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push37=, $1, 7
+; SIMD128-FAST-NEXT:    i32.const $push89=, 7
+; SIMD128-FAST-NEXT:    i32.and $push38=, $pop37, $pop89
+; SIMD128-FAST-NEXT:    i32.shr_s $push40=, $pop39, $pop38
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push41=, $pop36, 7, $pop40
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_s $push44=, $0, 8
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push42=, $1, 8
+; SIMD128-FAST-NEXT:    i32.const $push88=, 7
+; SIMD128-FAST-NEXT:    i32.and $push43=, $pop42, $pop88
+; SIMD128-FAST-NEXT:    i32.shr_s $push45=, $pop44, $pop43
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push46=, $pop41, 8, $pop45
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_s $push49=, $0, 9
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push47=, $1, 9
+; SIMD128-FAST-NEXT:    i32.const $push87=, 7
+; SIMD128-FAST-NEXT:    i32.and $push48=, $pop47, $pop87
+; SIMD128-FAST-NEXT:    i32.shr_s $push50=, $pop49, $pop48
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push51=, $pop46, 9, $pop50
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_s $push54=, $0, 10
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push52=, $1, 10
+; SIMD128-FAST-NEXT:    i32.const $push86=, 7
+; SIMD128-FAST-NEXT:    i32.and $push53=, $pop52, $pop86
+; SIMD128-FAST-NEXT:    i32.shr_s $push55=, $pop54, $pop53
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push56=, $pop51, 10, $pop55
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_s $push59=, $0, 11
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push57=, $1, 11
+; SIMD128-FAST-NEXT:    i32.const $push85=, 7
+; SIMD128-FAST-NEXT:    i32.and $push58=, $pop57, $pop85
+; SIMD128-FAST-NEXT:    i32.shr_s $push60=, $pop59, $pop58
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push61=, $pop56, 11, $pop60
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_s $push64=, $0, 12
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push62=, $1, 12
+; SIMD128-FAST-NEXT:    i32.const $push84=, 7
+; SIMD128-FAST-NEXT:    i32.and $push63=, $pop62, $pop84
+; SIMD128-FAST-NEXT:    i32.shr_s $push65=, $pop64, $pop63
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push66=, $pop61, 12, $pop65
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_s $push69=, $0, 13
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push67=, $1, 13
+; SIMD128-FAST-NEXT:    i32.const $push83=, 7
+; SIMD128-FAST-NEXT:    i32.and $push68=, $pop67, $pop83
+; SIMD128-FAST-NEXT:    i32.shr_s $push70=, $pop69, $pop68
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push71=, $pop66, 13, $pop70
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_s $push74=, $0, 14
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push72=, $1, 14
+; SIMD128-FAST-NEXT:    i32.const $push82=, 7
+; SIMD128-FAST-NEXT:    i32.and $push73=, $pop72, $pop82
+; SIMD128-FAST-NEXT:    i32.shr_s $push75=, $pop74, $pop73
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push76=, $pop71, 14, $pop75
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_s $push79=, $0, 15
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push77=, $1, 15
+; SIMD128-FAST-NEXT:    i32.const $push81=, 7
+; SIMD128-FAST-NEXT:    i32.and $push78=, $pop77, $pop81
+; SIMD128-FAST-NEXT:    i32.shr_s $push80=, $pop79, $pop78
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push0=, $pop76, 15, $pop80
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_s_vec_v16i8:
+; NO-SIMD128:         .functype shr_s_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.extend8_s $push2=, $9
+; NO-SIMD128-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-NEXT:    i32.and $push1=, $25, $pop0
+; NO-SIMD128-NEXT:    i32.shr_s $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop3
+; NO-SIMD128-NEXT:    i32.extend8_s $push5=, $5
+; NO-SIMD128-NEXT:    i32.const $push85=, 255
+; NO-SIMD128-NEXT:    i32.and $push4=, $21, $pop85
+; NO-SIMD128-NEXT:    i32.shr_s $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop6
+; NO-SIMD128-NEXT:    i32.extend8_s $push8=, $3
+; NO-SIMD128-NEXT:    i32.const $push84=, 255
+; NO-SIMD128-NEXT:    i32.and $push7=, $19, $pop84
+; NO-SIMD128-NEXT:    i32.shr_s $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop9
+; NO-SIMD128-NEXT:    i32.extend8_s $push11=, $2
+; NO-SIMD128-NEXT:    i32.const $push83=, 255
+; NO-SIMD128-NEXT:    i32.and $push10=, $18, $pop83
+; NO-SIMD128-NEXT:    i32.shr_s $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop12
+; NO-SIMD128-NEXT:    i32.extend8_s $push14=, $1
+; NO-SIMD128-NEXT:    i32.const $push82=, 255
+; NO-SIMD128-NEXT:    i32.and $push13=, $17, $pop82
+; NO-SIMD128-NEXT:    i32.shr_s $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop15
+; NO-SIMD128-NEXT:    i32.const $push19=, 15
+; NO-SIMD128-NEXT:    i32.add $push20=, $0, $pop19
+; NO-SIMD128-NEXT:    i32.extend8_s $push17=, $16
+; NO-SIMD128-NEXT:    i32.const $push81=, 255
+; NO-SIMD128-NEXT:    i32.and $push16=, $32, $pop81
+; NO-SIMD128-NEXT:    i32.shr_s $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT:    i32.store8 0($pop20), $pop18
+; NO-SIMD128-NEXT:    i32.const $push24=, 14
+; NO-SIMD128-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-NEXT:    i32.extend8_s $push22=, $15
+; NO-SIMD128-NEXT:    i32.const $push80=, 255
+; NO-SIMD128-NEXT:    i32.and $push21=, $31, $pop80
+; NO-SIMD128-NEXT:    i32.shr_s $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT:    i32.store8 0($pop25), $pop23
+; NO-SIMD128-NEXT:    i32.const $push29=, 13
+; NO-SIMD128-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-NEXT:    i32.extend8_s $push27=, $14
+; NO-SIMD128-NEXT:    i32.const $push79=, 255
+; NO-SIMD128-NEXT:    i32.and $push26=, $30, $pop79
+; NO-SIMD128-NEXT:    i32.shr_s $push28=, $pop27, $pop26
+; NO-SIMD128-NEXT:    i32.store8 0($pop30), $pop28
+; NO-SIMD128-NEXT:    i32.const $push34=, 12
+; NO-SIMD128-NEXT:    i32.add $push35=, $0, $pop34
+; NO-SIMD128-NEXT:    i32.extend8_s $push32=, $13
+; NO-SIMD128-NEXT:    i32.const $push78=, 255
+; NO-SIMD128-NEXT:    i32.and $push31=, $29, $pop78
+; NO-SIMD128-NEXT:    i32.shr_s $push33=, $pop32, $pop31
+; NO-SIMD128-NEXT:    i32.store8 0($pop35), $pop33
+; NO-SIMD128-NEXT:    i32.const $push39=, 11
+; NO-SIMD128-NEXT:    i32.add $push40=, $0, $pop39
+; NO-SIMD128-NEXT:    i32.extend8_s $push37=, $12
+; NO-SIMD128-NEXT:    i32.const $push77=, 255
+; NO-SIMD128-NEXT:    i32.and $push36=, $28, $pop77
+; NO-SIMD128-NEXT:    i32.shr_s $push38=, $pop37, $pop36
+; NO-SIMD128-NEXT:    i32.store8 0($pop40), $pop38
+; NO-SIMD128-NEXT:    i32.const $push44=, 10
+; NO-SIMD128-NEXT:    i32.add $push45=, $0, $pop44
+; NO-SIMD128-NEXT:    i32.extend8_s $push42=, $11
+; NO-SIMD128-NEXT:    i32.const $push76=, 255
+; NO-SIMD128-NEXT:    i32.and $push41=, $27, $pop76
+; NO-SIMD128-NEXT:    i32.shr_s $push43=, $pop42, $pop41
+; NO-SIMD128-NEXT:    i32.store8 0($pop45), $pop43
+; NO-SIMD128-NEXT:    i32.const $push49=, 9
+; NO-SIMD128-NEXT:    i32.add $push50=, $0, $pop49
+; NO-SIMD128-NEXT:    i32.extend8_s $push47=, $10
+; NO-SIMD128-NEXT:    i32.const $push75=, 255
+; NO-SIMD128-NEXT:    i32.and $push46=, $26, $pop75
+; NO-SIMD128-NEXT:    i32.shr_s $push48=, $pop47, $pop46
+; NO-SIMD128-NEXT:    i32.store8 0($pop50), $pop48
+; NO-SIMD128-NEXT:    i32.const $push54=, 7
+; NO-SIMD128-NEXT:    i32.add $push55=, $0, $pop54
+; NO-SIMD128-NEXT:    i32.extend8_s $push52=, $8
+; NO-SIMD128-NEXT:    i32.const $push74=, 255
+; NO-SIMD128-NEXT:    i32.and $push51=, $24, $pop74
+; NO-SIMD128-NEXT:    i32.shr_s $push53=, $pop52, $pop51
+; NO-SIMD128-NEXT:    i32.store8 0($pop55), $pop53
+; NO-SIMD128-NEXT:    i32.const $push59=, 6
+; NO-SIMD128-NEXT:    i32.add $push60=, $0, $pop59
+; NO-SIMD128-NEXT:    i32.extend8_s $push57=, $7
+; NO-SIMD128-NEXT:    i32.const $push73=, 255
+; NO-SIMD128-NEXT:    i32.and $push56=, $23, $pop73
+; NO-SIMD128-NEXT:    i32.shr_s $push58=, $pop57, $pop56
+; NO-SIMD128-NEXT:    i32.store8 0($pop60), $pop58
+; NO-SIMD128-NEXT:    i32.const $push64=, 5
+; NO-SIMD128-NEXT:    i32.add $push65=, $0, $pop64
+; NO-SIMD128-NEXT:    i32.extend8_s $push62=, $6
+; NO-SIMD128-NEXT:    i32.const $push72=, 255
+; NO-SIMD128-NEXT:    i32.and $push61=, $22, $pop72
+; NO-SIMD128-NEXT:    i32.shr_s $push63=, $pop62, $pop61
+; NO-SIMD128-NEXT:    i32.store8 0($pop65), $pop63
+; NO-SIMD128-NEXT:    i32.const $push69=, 3
+; NO-SIMD128-NEXT:    i32.add $push70=, $0, $pop69
+; NO-SIMD128-NEXT:    i32.extend8_s $push67=, $4
+; NO-SIMD128-NEXT:    i32.const $push71=, 255
+; NO-SIMD128-NEXT:    i32.and $push66=, $20, $pop71
+; NO-SIMD128-NEXT:    i32.shr_s $push68=, $pop67, $pop66
+; NO-SIMD128-NEXT:    i32.store8 0($pop70), $pop68
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_s_vec_v16i8:
+; NO-SIMD128-FAST:         .functype shr_s_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push2=, $1
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $17, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push5=, $2
+; NO-SIMD128-FAST-NEXT:    i32.const $push85=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $18, $pop85
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push8=, $3
+; NO-SIMD128-FAST-NEXT:    i32.const $push84=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $19, $pop84
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push10=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push13=, $4
+; NO-SIMD128-FAST-NEXT:    i32.const $push83=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push12=, $20, $pop83
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push14=, $pop13, $pop12
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop11), $pop14
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push16=, $5
+; NO-SIMD128-FAST-NEXT:    i32.const $push82=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $21, $pop82
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push17=, $pop16, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push18=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push21=, $6
+; NO-SIMD128-FAST-NEXT:    i32.const $push81=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push20=, $22, $pop81
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push22=, $pop21, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop19), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push23=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push26=, $7
+; NO-SIMD128-FAST-NEXT:    i32.const $push80=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $23, $pop80
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop24), $pop27
+; NO-SIMD128-FAST-NEXT:    i32.const $push28=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push29=, $0, $pop28
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push31=, $8
+; NO-SIMD128-FAST-NEXT:    i32.const $push79=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push30=, $24, $pop79
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push32=, $pop31, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop29), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push34=, $9
+; NO-SIMD128-FAST-NEXT:    i32.const $push78=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push33=, $25, $pop78
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push35=, $pop34, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop35
+; NO-SIMD128-FAST-NEXT:    i32.const $push36=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push37=, $0, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push39=, $10
+; NO-SIMD128-FAST-NEXT:    i32.const $push77=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push38=, $26, $pop77
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push40=, $pop39, $pop38
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop37), $pop40
+; NO-SIMD128-FAST-NEXT:    i32.const $push41=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push42=, $0, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push44=, $11
+; NO-SIMD128-FAST-NEXT:    i32.const $push76=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push43=, $27, $pop76
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push45=, $pop44, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop42), $pop45
+; NO-SIMD128-FAST-NEXT:    i32.const $push46=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push47=, $0, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push49=, $12
+; NO-SIMD128-FAST-NEXT:    i32.const $push75=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push48=, $28, $pop75
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push50=, $pop49, $pop48
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop47), $pop50
+; NO-SIMD128-FAST-NEXT:    i32.const $push51=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push52=, $0, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push54=, $13
+; NO-SIMD128-FAST-NEXT:    i32.const $push74=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push53=, $29, $pop74
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push55=, $pop54, $pop53
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop52), $pop55
+; NO-SIMD128-FAST-NEXT:    i32.const $push56=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push57=, $0, $pop56
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push59=, $14
+; NO-SIMD128-FAST-NEXT:    i32.const $push73=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push58=, $30, $pop73
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push60=, $pop59, $pop58
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop57), $pop60
+; NO-SIMD128-FAST-NEXT:    i32.const $push61=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push62=, $0, $pop61
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push64=, $15
+; NO-SIMD128-FAST-NEXT:    i32.const $push72=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push63=, $31, $pop72
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push65=, $pop64, $pop63
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop62), $pop65
+; NO-SIMD128-FAST-NEXT:    i32.const $push66=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push67=, $0, $pop66
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push69=, $16
+; NO-SIMD128-FAST-NEXT:    i32.const $push71=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push68=, $32, $pop71
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push70=, $pop69, $pop68
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop67), $pop70
+; NO-SIMD128-FAST-NEXT:    return
   %a = ashr <16 x i8> %v, %x
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: shr_u_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shr_u_v16i8 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shr_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @shr_u_v16i8(<16 x i8> %v, i8 %x) {
+; SIMD128-LABEL: shr_u_v16i8:
+; SIMD128:         .functype shr_u_v16i8 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shr_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: shr_u_v16i8:
+; SIMD128-FAST:         .functype shr_u_v16i8 (v128, i32) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.shr_u $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_u_v16i8:
+; NO-SIMD128:         .functype shr_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-NEXT:    i32.and $push1=, $9, $pop0
+; NO-SIMD128-NEXT:    i32.const $push72=, 255
+; NO-SIMD128-NEXT:    i32.and $push71=, $17, $pop72
+; NO-SIMD128-NEXT:    local.tee $push70=, $17=, $pop71
+; NO-SIMD128-NEXT:    i32.shr_u $push2=, $pop1, $pop70
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push69=, 255
+; NO-SIMD128-NEXT:    i32.and $push3=, $5, $pop69
+; NO-SIMD128-NEXT:    i32.shr_u $push4=, $pop3, $17
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push68=, 255
+; NO-SIMD128-NEXT:    i32.and $push5=, $3, $pop68
+; NO-SIMD128-NEXT:    i32.shr_u $push6=, $pop5, $17
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop6
+; NO-SIMD128-NEXT:    i32.const $push67=, 255
+; NO-SIMD128-NEXT:    i32.and $push7=, $2, $pop67
+; NO-SIMD128-NEXT:    i32.shr_u $push8=, $pop7, $17
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop8
+; NO-SIMD128-NEXT:    i32.const $push66=, 255
+; NO-SIMD128-NEXT:    i32.and $push9=, $1, $pop66
+; NO-SIMD128-NEXT:    i32.shr_u $push10=, $pop9, $17
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop10
+; NO-SIMD128-NEXT:    i32.const $push13=, 15
+; NO-SIMD128-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT:    i32.const $push65=, 255
+; NO-SIMD128-NEXT:    i32.and $push11=, $16, $pop65
+; NO-SIMD128-NEXT:    i32.shr_u $push12=, $pop11, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop14), $pop12
+; NO-SIMD128-NEXT:    i32.const $push17=, 14
+; NO-SIMD128-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-NEXT:    i32.const $push64=, 255
+; NO-SIMD128-NEXT:    i32.and $push15=, $15, $pop64
+; NO-SIMD128-NEXT:    i32.shr_u $push16=, $pop15, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop18), $pop16
+; NO-SIMD128-NEXT:    i32.const $push21=, 13
+; NO-SIMD128-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT:    i32.const $push63=, 255
+; NO-SIMD128-NEXT:    i32.and $push19=, $14, $pop63
+; NO-SIMD128-NEXT:    i32.shr_u $push20=, $pop19, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop22), $pop20
+; NO-SIMD128-NEXT:    i32.const $push25=, 12
+; NO-SIMD128-NEXT:    i32.add $push26=, $0, $pop25
+; NO-SIMD128-NEXT:    i32.const $push62=, 255
+; NO-SIMD128-NEXT:    i32.and $push23=, $13, $pop62
+; NO-SIMD128-NEXT:    i32.shr_u $push24=, $pop23, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop26), $pop24
+; NO-SIMD128-NEXT:    i32.const $push29=, 11
+; NO-SIMD128-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-NEXT:    i32.const $push61=, 255
+; NO-SIMD128-NEXT:    i32.and $push27=, $12, $pop61
+; NO-SIMD128-NEXT:    i32.shr_u $push28=, $pop27, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop30), $pop28
+; NO-SIMD128-NEXT:    i32.const $push33=, 10
+; NO-SIMD128-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-NEXT:    i32.const $push60=, 255
+; NO-SIMD128-NEXT:    i32.and $push31=, $11, $pop60
+; NO-SIMD128-NEXT:    i32.shr_u $push32=, $pop31, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop34), $pop32
+; NO-SIMD128-NEXT:    i32.const $push37=, 9
+; NO-SIMD128-NEXT:    i32.add $push38=, $0, $pop37
+; NO-SIMD128-NEXT:    i32.const $push59=, 255
+; NO-SIMD128-NEXT:    i32.and $push35=, $10, $pop59
+; NO-SIMD128-NEXT:    i32.shr_u $push36=, $pop35, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop38), $pop36
+; NO-SIMD128-NEXT:    i32.const $push41=, 7
+; NO-SIMD128-NEXT:    i32.add $push42=, $0, $pop41
+; NO-SIMD128-NEXT:    i32.const $push58=, 255
+; NO-SIMD128-NEXT:    i32.and $push39=, $8, $pop58
+; NO-SIMD128-NEXT:    i32.shr_u $push40=, $pop39, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop42), $pop40
+; NO-SIMD128-NEXT:    i32.const $push45=, 6
+; NO-SIMD128-NEXT:    i32.add $push46=, $0, $pop45
+; NO-SIMD128-NEXT:    i32.const $push57=, 255
+; NO-SIMD128-NEXT:    i32.and $push43=, $7, $pop57
+; NO-SIMD128-NEXT:    i32.shr_u $push44=, $pop43, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop46), $pop44
+; NO-SIMD128-NEXT:    i32.const $push49=, 5
+; NO-SIMD128-NEXT:    i32.add $push50=, $0, $pop49
+; NO-SIMD128-NEXT:    i32.const $push56=, 255
+; NO-SIMD128-NEXT:    i32.and $push47=, $6, $pop56
+; NO-SIMD128-NEXT:    i32.shr_u $push48=, $pop47, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop50), $pop48
+; NO-SIMD128-NEXT:    i32.const $push53=, 3
+; NO-SIMD128-NEXT:    i32.add $push54=, $0, $pop53
+; NO-SIMD128-NEXT:    i32.const $push55=, 255
+; NO-SIMD128-NEXT:    i32.and $push51=, $4, $pop55
+; NO-SIMD128-NEXT:    i32.shr_u $push52=, $pop51, $17
+; NO-SIMD128-NEXT:    i32.store8 0($pop54), $pop52
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_u_v16i8:
+; NO-SIMD128-FAST:         .functype shr_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.const $push72=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push71=, $17, $pop72
+; NO-SIMD128-FAST-NEXT:    local.tee $push70=, $1=, $pop71
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push2=, $pop1, $pop70
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push69=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $2, $pop69
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push4=, $pop3, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push68=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $3, $pop68
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push6=, $pop5, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push9=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push67=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $4, $pop67
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push8=, $pop7, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push66=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $5, $pop66
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push12=, $pop11, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push15=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.const $push65=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $6, $pop65
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push14=, $pop13, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop16), $pop14
+; NO-SIMD128-FAST-NEXT:    i32.const $push19=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push20=, $0, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.const $push64=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push17=, $7, $pop64
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push18=, $pop17, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop20), $pop18
+; NO-SIMD128-FAST-NEXT:    i32.const $push23=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.const $push63=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push21=, $8, $pop63
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push22=, $pop21, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop24), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push62=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $9, $pop62
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push26=, $pop25, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push29=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.const $push61=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push27=, $10, $pop61
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push28=, $pop27, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop30), $pop28
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.const $push60=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $11, $pop60
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push32=, $pop31, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop34), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.const $push37=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push38=, $0, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.const $push59=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push35=, $12, $pop59
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push36=, $pop35, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop38), $pop36
+; NO-SIMD128-FAST-NEXT:    i32.const $push41=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push42=, $0, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.const $push58=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push39=, $13, $pop58
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push40=, $pop39, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop42), $pop40
+; NO-SIMD128-FAST-NEXT:    i32.const $push45=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push46=, $0, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.const $push57=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push43=, $14, $pop57
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push44=, $pop43, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop46), $pop44
+; NO-SIMD128-FAST-NEXT:    i32.const $push49=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push50=, $0, $pop49
+; NO-SIMD128-FAST-NEXT:    i32.const $push56=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push47=, $15, $pop56
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push48=, $pop47, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop50), $pop48
+; NO-SIMD128-FAST-NEXT:    i32.const $push53=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push54=, $0, $pop53
+; NO-SIMD128-FAST-NEXT:    i32.const $push55=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push51=, $16, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push52=, $pop51, $1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop54), $pop52
+; NO-SIMD128-FAST-NEXT:    return
   %t = insertelement <16 x i8> undef, i8 %x, i32 0
   %s = shufflevector <16 x i8> %t, <16 x i8> undef,
     <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
@@ -242,64 +4233,1022 @@ define <16 x i8> @shr_u_v16i8(<16 x i8> %v, i8 %x) {
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: shr_u_vec_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shr_u_vec_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: i8x16.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
-; SIMD128-NEXT: i32.const $push[[M0:[0-9]+]]=, 7{{$}}
-; SIMD128-NEXT: i32.and $push[[M1:[0-9]+]]=, $pop[[L1]], $pop[[M0]]{{$}}
-; SIMD128-NEXT: i32.shr_u $push[[M2:[0-9]+]]=, $pop[[L0]], $pop[[M1]]
-; SIMD128-NEXT: i8x16.splat $push[[M3:[0-9]+]]=, $pop[[M2]]
-; Skip 14 lanes
-; SIMD128:      i8x16.extract_lane_u $push[[L4:[0-9]+]]=, $0, 15{{$}}
-; SIMD128-NEXT: i8x16.extract_lane_u $push[[L5:[0-9]+]]=, $1, 15{{$}}
-; SIMD128-NEXT: i32.const $push[[M4:[0-9]+]]=, 7{{$}}
-; SIMD128-NEXT: i32.and $push[[M5:[0-9]+]]=, $pop[[L5]], $pop[[M4]]{{$}}
-; SIMD128-NEXT: i32.shr_u $push[[M6:[0-9]+]]=, $pop[[L4]], $pop[[M5]]{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[R:[0-9]+]]=, $pop[[M7:[0-9]+]], 15, $pop[[M6]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @shr_u_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
+; SIMD128-LABEL: shr_u_vec_v16i8:
+; SIMD128:         .functype shr_u_vec_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.extract_lane_u $push7=, $0, 0
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $1, 0
+; SIMD128-NEXT:    i32.const $push1=, 7
+; SIMD128-NEXT:    i32.and $push6=, $pop5, $pop1
+; SIMD128-NEXT:    i32.shr_u $push8=, $pop7, $pop6
+; SIMD128-NEXT:    i8x16.splat $push9=, $pop8
+; SIMD128-NEXT:    i8x16.extract_lane_u $push3=, $0, 1
+; SIMD128-NEXT:    i8x16.extract_lane_u $push0=, $1, 1
+; SIMD128-NEXT:    i32.const $push95=, 7
+; SIMD128-NEXT:    i32.and $push2=, $pop0, $pop95
+; SIMD128-NEXT:    i32.shr_u $push4=, $pop3, $pop2
+; SIMD128-NEXT:    i8x16.replace_lane $push10=, $pop9, 1, $pop4
+; SIMD128-NEXT:    i8x16.extract_lane_u $push13=, $0, 2
+; SIMD128-NEXT:    i8x16.extract_lane_u $push11=, $1, 2
+; SIMD128-NEXT:    i32.const $push94=, 7
+; SIMD128-NEXT:    i32.and $push12=, $pop11, $pop94
+; SIMD128-NEXT:    i32.shr_u $push14=, $pop13, $pop12
+; SIMD128-NEXT:    i8x16.replace_lane $push15=, $pop10, 2, $pop14
+; SIMD128-NEXT:    i8x16.extract_lane_u $push18=, $0, 3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push16=, $1, 3
+; SIMD128-NEXT:    i32.const $push93=, 7
+; SIMD128-NEXT:    i32.and $push17=, $pop16, $pop93
+; SIMD128-NEXT:    i32.shr_u $push19=, $pop18, $pop17
+; SIMD128-NEXT:    i8x16.replace_lane $push20=, $pop15, 3, $pop19
+; SIMD128-NEXT:    i8x16.extract_lane_u $push23=, $0, 4
+; SIMD128-NEXT:    i8x16.extract_lane_u $push21=, $1, 4
+; SIMD128-NEXT:    i32.const $push92=, 7
+; SIMD128-NEXT:    i32.and $push22=, $pop21, $pop92
+; SIMD128-NEXT:    i32.shr_u $push24=, $pop23, $pop22
+; SIMD128-NEXT:    i8x16.replace_lane $push25=, $pop20, 4, $pop24
+; SIMD128-NEXT:    i8x16.extract_lane_u $push28=, $0, 5
+; SIMD128-NEXT:    i8x16.extract_lane_u $push26=, $1, 5
+; SIMD128-NEXT:    i32.const $push91=, 7
+; SIMD128-NEXT:    i32.and $push27=, $pop26, $pop91
+; SIMD128-NEXT:    i32.shr_u $push29=, $pop28, $pop27
+; SIMD128-NEXT:    i8x16.replace_lane $push30=, $pop25, 5, $pop29
+; SIMD128-NEXT:    i8x16.extract_lane_u $push33=, $0, 6
+; SIMD128-NEXT:    i8x16.extract_lane_u $push31=, $1, 6
+; SIMD128-NEXT:    i32.const $push90=, 7
+; SIMD128-NEXT:    i32.and $push32=, $pop31, $pop90
+; SIMD128-NEXT:    i32.shr_u $push34=, $pop33, $pop32
+; SIMD128-NEXT:    i8x16.replace_lane $push35=, $pop30, 6, $pop34
+; SIMD128-NEXT:    i8x16.extract_lane_u $push38=, $0, 7
+; SIMD128-NEXT:    i8x16.extract_lane_u $push36=, $1, 7
+; SIMD128-NEXT:    i32.const $push89=, 7
+; SIMD128-NEXT:    i32.and $push37=, $pop36, $pop89
+; SIMD128-NEXT:    i32.shr_u $push39=, $pop38, $pop37
+; SIMD128-NEXT:    i8x16.replace_lane $push40=, $pop35, 7, $pop39
+; SIMD128-NEXT:    i8x16.extract_lane_u $push43=, $0, 8
+; SIMD128-NEXT:    i8x16.extract_lane_u $push41=, $1, 8
+; SIMD128-NEXT:    i32.const $push88=, 7
+; SIMD128-NEXT:    i32.and $push42=, $pop41, $pop88
+; SIMD128-NEXT:    i32.shr_u $push44=, $pop43, $pop42
+; SIMD128-NEXT:    i8x16.replace_lane $push45=, $pop40, 8, $pop44
+; SIMD128-NEXT:    i8x16.extract_lane_u $push48=, $0, 9
+; SIMD128-NEXT:    i8x16.extract_lane_u $push46=, $1, 9
+; SIMD128-NEXT:    i32.const $push87=, 7
+; SIMD128-NEXT:    i32.and $push47=, $pop46, $pop87
+; SIMD128-NEXT:    i32.shr_u $push49=, $pop48, $pop47
+; SIMD128-NEXT:    i8x16.replace_lane $push50=, $pop45, 9, $pop49
+; SIMD128-NEXT:    i8x16.extract_lane_u $push53=, $0, 10
+; SIMD128-NEXT:    i8x16.extract_lane_u $push51=, $1, 10
+; SIMD128-NEXT:    i32.const $push86=, 7
+; SIMD128-NEXT:    i32.and $push52=, $pop51, $pop86
+; SIMD128-NEXT:    i32.shr_u $push54=, $pop53, $pop52
+; SIMD128-NEXT:    i8x16.replace_lane $push55=, $pop50, 10, $pop54
+; SIMD128-NEXT:    i8x16.extract_lane_u $push58=, $0, 11
+; SIMD128-NEXT:    i8x16.extract_lane_u $push56=, $1, 11
+; SIMD128-NEXT:    i32.const $push85=, 7
+; SIMD128-NEXT:    i32.and $push57=, $pop56, $pop85
+; SIMD128-NEXT:    i32.shr_u $push59=, $pop58, $pop57
+; SIMD128-NEXT:    i8x16.replace_lane $push60=, $pop55, 11, $pop59
+; SIMD128-NEXT:    i8x16.extract_lane_u $push63=, $0, 12
+; SIMD128-NEXT:    i8x16.extract_lane_u $push61=, $1, 12
+; SIMD128-NEXT:    i32.const $push84=, 7
+; SIMD128-NEXT:    i32.and $push62=, $pop61, $pop84
+; SIMD128-NEXT:    i32.shr_u $push64=, $pop63, $pop62
+; SIMD128-NEXT:    i8x16.replace_lane $push65=, $pop60, 12, $pop64
+; SIMD128-NEXT:    i8x16.extract_lane_u $push68=, $0, 13
+; SIMD128-NEXT:    i8x16.extract_lane_u $push66=, $1, 13
+; SIMD128-NEXT:    i32.const $push83=, 7
+; SIMD128-NEXT:    i32.and $push67=, $pop66, $pop83
+; SIMD128-NEXT:    i32.shr_u $push69=, $pop68, $pop67
+; SIMD128-NEXT:    i8x16.replace_lane $push70=, $pop65, 13, $pop69
+; SIMD128-NEXT:    i8x16.extract_lane_u $push73=, $0, 14
+; SIMD128-NEXT:    i8x16.extract_lane_u $push71=, $1, 14
+; SIMD128-NEXT:    i32.const $push82=, 7
+; SIMD128-NEXT:    i32.and $push72=, $pop71, $pop82
+; SIMD128-NEXT:    i32.shr_u $push74=, $pop73, $pop72
+; SIMD128-NEXT:    i8x16.replace_lane $push75=, $pop70, 14, $pop74
+; SIMD128-NEXT:    i8x16.extract_lane_u $push78=, $0, 15
+; SIMD128-NEXT:    i8x16.extract_lane_u $push76=, $1, 15
+; SIMD128-NEXT:    i32.const $push81=, 7
+; SIMD128-NEXT:    i32.and $push77=, $pop76, $pop81
+; SIMD128-NEXT:    i32.shr_u $push79=, $pop78, $pop77
+; SIMD128-NEXT:    i8x16.replace_lane $push80=, $pop75, 15, $pop79
+; SIMD128-NEXT:    return $pop80
+;
+; SIMD128-FAST-LABEL: shr_u_vec_v16i8:
+; SIMD128-FAST:         .functype shr_u_vec_v16i8 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push8=, $0, 0
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push6=, $1, 0
+; SIMD128-FAST-NEXT:    i32.const $push2=, 7
+; SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $pop2
+; SIMD128-FAST-NEXT:    i32.shr_u $push9=, $pop8, $pop7
+; SIMD128-FAST-NEXT:    i8x16.splat $push10=, $pop9
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push4=, $0, 1
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push1=, $1, 1
+; SIMD128-FAST-NEXT:    i32.const $push95=, 7
+; SIMD128-FAST-NEXT:    i32.and $push3=, $pop1, $pop95
+; SIMD128-FAST-NEXT:    i32.shr_u $push5=, $pop4, $pop3
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push11=, $pop10, 1, $pop5
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push14=, $0, 2
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push12=, $1, 2
+; SIMD128-FAST-NEXT:    i32.const $push94=, 7
+; SIMD128-FAST-NEXT:    i32.and $push13=, $pop12, $pop94
+; SIMD128-FAST-NEXT:    i32.shr_u $push15=, $pop14, $pop13
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push16=, $pop11, 2, $pop15
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push19=, $0, 3
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push17=, $1, 3
+; SIMD128-FAST-NEXT:    i32.const $push93=, 7
+; SIMD128-FAST-NEXT:    i32.and $push18=, $pop17, $pop93
+; SIMD128-FAST-NEXT:    i32.shr_u $push20=, $pop19, $pop18
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push21=, $pop16, 3, $pop20
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push24=, $0, 4
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push22=, $1, 4
+; SIMD128-FAST-NEXT:    i32.const $push92=, 7
+; SIMD128-FAST-NEXT:    i32.and $push23=, $pop22, $pop92
+; SIMD128-FAST-NEXT:    i32.shr_u $push25=, $pop24, $pop23
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push26=, $pop21, 4, $pop25
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push29=, $0, 5
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push27=, $1, 5
+; SIMD128-FAST-NEXT:    i32.const $push91=, 7
+; SIMD128-FAST-NEXT:    i32.and $push28=, $pop27, $pop91
+; SIMD128-FAST-NEXT:    i32.shr_u $push30=, $pop29, $pop28
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push31=, $pop26, 5, $pop30
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push34=, $0, 6
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push32=, $1, 6
+; SIMD128-FAST-NEXT:    i32.const $push90=, 7
+; SIMD128-FAST-NEXT:    i32.and $push33=, $pop32, $pop90
+; SIMD128-FAST-NEXT:    i32.shr_u $push35=, $pop34, $pop33
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push36=, $pop31, 6, $pop35
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push39=, $0, 7
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push37=, $1, 7
+; SIMD128-FAST-NEXT:    i32.const $push89=, 7
+; SIMD128-FAST-NEXT:    i32.and $push38=, $pop37, $pop89
+; SIMD128-FAST-NEXT:    i32.shr_u $push40=, $pop39, $pop38
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push41=, $pop36, 7, $pop40
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push44=, $0, 8
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push42=, $1, 8
+; SIMD128-FAST-NEXT:    i32.const $push88=, 7
+; SIMD128-FAST-NEXT:    i32.and $push43=, $pop42, $pop88
+; SIMD128-FAST-NEXT:    i32.shr_u $push45=, $pop44, $pop43
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push46=, $pop41, 8, $pop45
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push49=, $0, 9
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push47=, $1, 9
+; SIMD128-FAST-NEXT:    i32.const $push87=, 7
+; SIMD128-FAST-NEXT:    i32.and $push48=, $pop47, $pop87
+; SIMD128-FAST-NEXT:    i32.shr_u $push50=, $pop49, $pop48
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push51=, $pop46, 9, $pop50
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push54=, $0, 10
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push52=, $1, 10
+; SIMD128-FAST-NEXT:    i32.const $push86=, 7
+; SIMD128-FAST-NEXT:    i32.and $push53=, $pop52, $pop86
+; SIMD128-FAST-NEXT:    i32.shr_u $push55=, $pop54, $pop53
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push56=, $pop51, 10, $pop55
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push59=, $0, 11
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push57=, $1, 11
+; SIMD128-FAST-NEXT:    i32.const $push85=, 7
+; SIMD128-FAST-NEXT:    i32.and $push58=, $pop57, $pop85
+; SIMD128-FAST-NEXT:    i32.shr_u $push60=, $pop59, $pop58
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push61=, $pop56, 11, $pop60
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push64=, $0, 12
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push62=, $1, 12
+; SIMD128-FAST-NEXT:    i32.const $push84=, 7
+; SIMD128-FAST-NEXT:    i32.and $push63=, $pop62, $pop84
+; SIMD128-FAST-NEXT:    i32.shr_u $push65=, $pop64, $pop63
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push66=, $pop61, 12, $pop65
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push69=, $0, 13
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push67=, $1, 13
+; SIMD128-FAST-NEXT:    i32.const $push83=, 7
+; SIMD128-FAST-NEXT:    i32.and $push68=, $pop67, $pop83
+; SIMD128-FAST-NEXT:    i32.shr_u $push70=, $pop69, $pop68
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push71=, $pop66, 13, $pop70
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push74=, $0, 14
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push72=, $1, 14
+; SIMD128-FAST-NEXT:    i32.const $push82=, 7
+; SIMD128-FAST-NEXT:    i32.and $push73=, $pop72, $pop82
+; SIMD128-FAST-NEXT:    i32.shr_u $push75=, $pop74, $pop73
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push76=, $pop71, 14, $pop75
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push79=, $0, 15
+; SIMD128-FAST-NEXT:    i8x16.extract_lane_u $push77=, $1, 15
+; SIMD128-FAST-NEXT:    i32.const $push81=, 7
+; SIMD128-FAST-NEXT:    i32.and $push78=, $pop77, $pop81
+; SIMD128-FAST-NEXT:    i32.shr_u $push80=, $pop79, $pop78
+; SIMD128-FAST-NEXT:    i8x16.replace_lane $push0=, $pop76, 15, $pop80
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_u_vec_v16i8:
+; NO-SIMD128:         .functype shr_u_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-NEXT:    i32.and $push2=, $9, $pop0
+; NO-SIMD128-NEXT:    i32.const $push101=, 255
+; NO-SIMD128-NEXT:    i32.and $push1=, $25, $pop101
+; NO-SIMD128-NEXT:    i32.shr_u $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push100=, 255
+; NO-SIMD128-NEXT:    i32.and $push5=, $5, $pop100
+; NO-SIMD128-NEXT:    i32.const $push99=, 255
+; NO-SIMD128-NEXT:    i32.and $push4=, $21, $pop99
+; NO-SIMD128-NEXT:    i32.shr_u $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop6
+; NO-SIMD128-NEXT:    i32.const $push98=, 255
+; NO-SIMD128-NEXT:    i32.and $push8=, $3, $pop98
+; NO-SIMD128-NEXT:    i32.const $push97=, 255
+; NO-SIMD128-NEXT:    i32.and $push7=, $19, $pop97
+; NO-SIMD128-NEXT:    i32.shr_u $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop9
+; NO-SIMD128-NEXT:    i32.const $push96=, 255
+; NO-SIMD128-NEXT:    i32.and $push11=, $2, $pop96
+; NO-SIMD128-NEXT:    i32.const $push95=, 255
+; NO-SIMD128-NEXT:    i32.and $push10=, $18, $pop95
+; NO-SIMD128-NEXT:    i32.shr_u $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop12
+; NO-SIMD128-NEXT:    i32.const $push94=, 255
+; NO-SIMD128-NEXT:    i32.and $push14=, $1, $pop94
+; NO-SIMD128-NEXT:    i32.const $push93=, 255
+; NO-SIMD128-NEXT:    i32.and $push13=, $17, $pop93
+; NO-SIMD128-NEXT:    i32.shr_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop15
+; NO-SIMD128-NEXT:    i32.const $push19=, 15
+; NO-SIMD128-NEXT:    i32.add $push20=, $0, $pop19
+; NO-SIMD128-NEXT:    i32.const $push92=, 255
+; NO-SIMD128-NEXT:    i32.and $push17=, $16, $pop92
+; NO-SIMD128-NEXT:    i32.const $push91=, 255
+; NO-SIMD128-NEXT:    i32.and $push16=, $32, $pop91
+; NO-SIMD128-NEXT:    i32.shr_u $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT:    i32.store8 0($pop20), $pop18
+; NO-SIMD128-NEXT:    i32.const $push24=, 14
+; NO-SIMD128-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-NEXT:    i32.const $push90=, 255
+; NO-SIMD128-NEXT:    i32.and $push22=, $15, $pop90
+; NO-SIMD128-NEXT:    i32.const $push89=, 255
+; NO-SIMD128-NEXT:    i32.and $push21=, $31, $pop89
+; NO-SIMD128-NEXT:    i32.shr_u $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT:    i32.store8 0($pop25), $pop23
+; NO-SIMD128-NEXT:    i32.const $push29=, 13
+; NO-SIMD128-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-NEXT:    i32.const $push88=, 255
+; NO-SIMD128-NEXT:    i32.and $push27=, $14, $pop88
+; NO-SIMD128-NEXT:    i32.const $push87=, 255
+; NO-SIMD128-NEXT:    i32.and $push26=, $30, $pop87
+; NO-SIMD128-NEXT:    i32.shr_u $push28=, $pop27, $pop26
+; NO-SIMD128-NEXT:    i32.store8 0($pop30), $pop28
+; NO-SIMD128-NEXT:    i32.const $push34=, 12
+; NO-SIMD128-NEXT:    i32.add $push35=, $0, $pop34
+; NO-SIMD128-NEXT:    i32.const $push86=, 255
+; NO-SIMD128-NEXT:    i32.and $push32=, $13, $pop86
+; NO-SIMD128-NEXT:    i32.const $push85=, 255
+; NO-SIMD128-NEXT:    i32.and $push31=, $29, $pop85
+; NO-SIMD128-NEXT:    i32.shr_u $push33=, $pop32, $pop31
+; NO-SIMD128-NEXT:    i32.store8 0($pop35), $pop33
+; NO-SIMD128-NEXT:    i32.const $push39=, 11
+; NO-SIMD128-NEXT:    i32.add $push40=, $0, $pop39
+; NO-SIMD128-NEXT:    i32.const $push84=, 255
+; NO-SIMD128-NEXT:    i32.and $push37=, $12, $pop84
+; NO-SIMD128-NEXT:    i32.const $push83=, 255
+; NO-SIMD128-NEXT:    i32.and $push36=, $28, $pop83
+; NO-SIMD128-NEXT:    i32.shr_u $push38=, $pop37, $pop36
+; NO-SIMD128-NEXT:    i32.store8 0($pop40), $pop38
+; NO-SIMD128-NEXT:    i32.const $push44=, 10
+; NO-SIMD128-NEXT:    i32.add $push45=, $0, $pop44
+; NO-SIMD128-NEXT:    i32.const $push82=, 255
+; NO-SIMD128-NEXT:    i32.and $push42=, $11, $pop82
+; NO-SIMD128-NEXT:    i32.const $push81=, 255
+; NO-SIMD128-NEXT:    i32.and $push41=, $27, $pop81
+; NO-SIMD128-NEXT:    i32.shr_u $push43=, $pop42, $pop41
+; NO-SIMD128-NEXT:    i32.store8 0($pop45), $pop43
+; NO-SIMD128-NEXT:    i32.const $push49=, 9
+; NO-SIMD128-NEXT:    i32.add $push50=, $0, $pop49
+; NO-SIMD128-NEXT:    i32.const $push80=, 255
+; NO-SIMD128-NEXT:    i32.and $push47=, $10, $pop80
+; NO-SIMD128-NEXT:    i32.const $push79=, 255
+; NO-SIMD128-NEXT:    i32.and $push46=, $26, $pop79
+; NO-SIMD128-NEXT:    i32.shr_u $push48=, $pop47, $pop46
+; NO-SIMD128-NEXT:    i32.store8 0($pop50), $pop48
+; NO-SIMD128-NEXT:    i32.const $push54=, 7
+; NO-SIMD128-NEXT:    i32.add $push55=, $0, $pop54
+; NO-SIMD128-NEXT:    i32.const $push78=, 255
+; NO-SIMD128-NEXT:    i32.and $push52=, $8, $pop78
+; NO-SIMD128-NEXT:    i32.const $push77=, 255
+; NO-SIMD128-NEXT:    i32.and $push51=, $24, $pop77
+; NO-SIMD128-NEXT:    i32.shr_u $push53=, $pop52, $pop51
+; NO-SIMD128-NEXT:    i32.store8 0($pop55), $pop53
+; NO-SIMD128-NEXT:    i32.const $push59=, 6
+; NO-SIMD128-NEXT:    i32.add $push60=, $0, $pop59
+; NO-SIMD128-NEXT:    i32.const $push76=, 255
+; NO-SIMD128-NEXT:    i32.and $push57=, $7, $pop76
+; NO-SIMD128-NEXT:    i32.const $push75=, 255
+; NO-SIMD128-NEXT:    i32.and $push56=, $23, $pop75
+; NO-SIMD128-NEXT:    i32.shr_u $push58=, $pop57, $pop56
+; NO-SIMD128-NEXT:    i32.store8 0($pop60), $pop58
+; NO-SIMD128-NEXT:    i32.const $push64=, 5
+; NO-SIMD128-NEXT:    i32.add $push65=, $0, $pop64
+; NO-SIMD128-NEXT:    i32.const $push74=, 255
+; NO-SIMD128-NEXT:    i32.and $push62=, $6, $pop74
+; NO-SIMD128-NEXT:    i32.const $push73=, 255
+; NO-SIMD128-NEXT:    i32.and $push61=, $22, $pop73
+; NO-SIMD128-NEXT:    i32.shr_u $push63=, $pop62, $pop61
+; NO-SIMD128-NEXT:    i32.store8 0($pop65), $pop63
+; NO-SIMD128-NEXT:    i32.const $push69=, 3
+; NO-SIMD128-NEXT:    i32.add $push70=, $0, $pop69
+; NO-SIMD128-NEXT:    i32.const $push72=, 255
+; NO-SIMD128-NEXT:    i32.and $push67=, $4, $pop72
+; NO-SIMD128-NEXT:    i32.const $push71=, 255
+; NO-SIMD128-NEXT:    i32.and $push66=, $20, $pop71
+; NO-SIMD128-NEXT:    i32.shr_u $push68=, $pop67, $pop66
+; NO-SIMD128-NEXT:    i32.store8 0($pop70), $pop68
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_u_vec_v16i8:
+; NO-SIMD128-FAST:         .functype shr_u_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.const $push101=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $17, $pop101
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push100=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $2, $pop100
+; NO-SIMD128-FAST-NEXT:    i32.const $push99=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $18, $pop99
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push98=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push8=, $3, $pop98
+; NO-SIMD128-FAST-NEXT:    i32.const $push97=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $19, $pop97
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push96=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $4, $pop96
+; NO-SIMD128-FAST-NEXT:    i32.const $push95=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $20, $pop95
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop14), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push94=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push16=, $5, $pop94
+; NO-SIMD128-FAST-NEXT:    i32.const $push93=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $21, $pop93
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push17=, $pop16, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push92=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $6, $pop92
+; NO-SIMD128-FAST-NEXT:    i32.const $push91=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push18=, $22, $pop91
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push20=, $pop19, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop22), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push90=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push24=, $7, $pop90
+; NO-SIMD128-FAST-NEXT:    i32.const $push89=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $23, $pop89
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push25=, $pop24, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop27), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push31=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push88=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $8, $pop88
+; NO-SIMD128-FAST-NEXT:    i32.const $push87=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push28=, $24, $pop87
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push30=, $pop29, $pop28
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT:    i32.const $push86=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push34=, $9, $pop86
+; NO-SIMD128-FAST-NEXT:    i32.const $push85=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push33=, $25, $pop85
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push35=, $pop34, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop35
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push40=, $0, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.const $push84=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $10, $pop84
+; NO-SIMD128-FAST-NEXT:    i32.const $push83=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push36=, $26, $pop83
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push38=, $pop37, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT:    i32.const $push44=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push45=, $0, $pop44
+; NO-SIMD128-FAST-NEXT:    i32.const $push82=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push42=, $11, $pop82
+; NO-SIMD128-FAST-NEXT:    i32.const $push81=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push41=, $27, $pop81
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push43=, $pop42, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop45), $pop43
+; NO-SIMD128-FAST-NEXT:    i32.const $push49=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push50=, $0, $pop49
+; NO-SIMD128-FAST-NEXT:    i32.const $push80=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push47=, $12, $pop80
+; NO-SIMD128-FAST-NEXT:    i32.const $push79=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push46=, $28, $pop79
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push48=, $pop47, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop50), $pop48
+; NO-SIMD128-FAST-NEXT:    i32.const $push54=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push55=, $0, $pop54
+; NO-SIMD128-FAST-NEXT:    i32.const $push78=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push52=, $13, $pop78
+; NO-SIMD128-FAST-NEXT:    i32.const $push77=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push51=, $29, $pop77
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push53=, $pop52, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop55), $pop53
+; NO-SIMD128-FAST-NEXT:    i32.const $push59=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push60=, $0, $pop59
+; NO-SIMD128-FAST-NEXT:    i32.const $push76=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push57=, $14, $pop76
+; NO-SIMD128-FAST-NEXT:    i32.const $push75=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push56=, $30, $pop75
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push58=, $pop57, $pop56
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop60), $pop58
+; NO-SIMD128-FAST-NEXT:    i32.const $push64=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push65=, $0, $pop64
+; NO-SIMD128-FAST-NEXT:    i32.const $push74=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push62=, $15, $pop74
+; NO-SIMD128-FAST-NEXT:    i32.const $push73=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push61=, $31, $pop73
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push63=, $pop62, $pop61
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop65), $pop63
+; NO-SIMD128-FAST-NEXT:    i32.const $push69=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push70=, $0, $pop69
+; NO-SIMD128-FAST-NEXT:    i32.const $push72=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push67=, $16, $pop72
+; NO-SIMD128-FAST-NEXT:    i32.const $push71=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push66=, $32, $pop71
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push68=, $pop67, $pop66
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop70), $pop68
+; NO-SIMD128-FAST-NEXT:    return
   %a = lshr <16 x i8> %v, %x
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: and_v16i8:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype and_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @and_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SIMD128-LABEL: and_v16i8:
+; SIMD128:         .functype and_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.and $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: and_v16i8:
+; SIMD128-FAST:         .functype and_v16i8 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.and $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: and_v16i8:
+; NO-SIMD128:         .functype and_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.and $push0=, $9, $25
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.and $push1=, $5, $21
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.and $push2=, $3, $19
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop2
+; NO-SIMD128-NEXT:    i32.and $push3=, $2, $18
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop3
+; NO-SIMD128-NEXT:    i32.and $push4=, $1, $17
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push6=, 15
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.and $push5=, $16, $32
+; NO-SIMD128-NEXT:    i32.store8 0($pop7), $pop5
+; NO-SIMD128-NEXT:    i32.const $push9=, 14
+; NO-SIMD128-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT:    i32.and $push8=, $15, $31
+; NO-SIMD128-NEXT:    i32.store8 0($pop10), $pop8
+; NO-SIMD128-NEXT:    i32.const $push12=, 13
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.and $push11=, $14, $30
+; NO-SIMD128-NEXT:    i32.store8 0($pop13), $pop11
+; NO-SIMD128-NEXT:    i32.const $push15=, 12
+; NO-SIMD128-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-NEXT:    i32.and $push14=, $13, $29
+; NO-SIMD128-NEXT:    i32.store8 0($pop16), $pop14
+; NO-SIMD128-NEXT:    i32.const $push18=, 11
+; NO-SIMD128-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-NEXT:    i32.and $push17=, $12, $28
+; NO-SIMD128-NEXT:    i32.store8 0($pop19), $pop17
+; NO-SIMD128-NEXT:    i32.const $push21=, 10
+; NO-SIMD128-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT:    i32.and $push20=, $11, $27
+; NO-SIMD128-NEXT:    i32.store8 0($pop22), $pop20
+; NO-SIMD128-NEXT:    i32.const $push24=, 9
+; NO-SIMD128-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-NEXT:    i32.and $push23=, $10, $26
+; NO-SIMD128-NEXT:    i32.store8 0($pop25), $pop23
+; NO-SIMD128-NEXT:    i32.const $push27=, 7
+; NO-SIMD128-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-NEXT:    i32.and $push26=, $8, $24
+; NO-SIMD128-NEXT:    i32.store8 0($pop28), $pop26
+; NO-SIMD128-NEXT:    i32.const $push30=, 6
+; NO-SIMD128-NEXT:    i32.add $push31=, $0, $pop30
+; NO-SIMD128-NEXT:    i32.and $push29=, $7, $23
+; NO-SIMD128-NEXT:    i32.store8 0($pop31), $pop29
+; NO-SIMD128-NEXT:    i32.const $push33=, 5
+; NO-SIMD128-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-NEXT:    i32.and $push32=, $6, $22
+; NO-SIMD128-NEXT:    i32.store8 0($pop34), $pop32
+; NO-SIMD128-NEXT:    i32.const $push36=, 3
+; NO-SIMD128-NEXT:    i32.add $push37=, $0, $pop36
+; NO-SIMD128-NEXT:    i32.and $push35=, $4, $20
+; NO-SIMD128-NEXT:    i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: and_v16i8:
+; NO-SIMD128-FAST:         .functype and_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $1, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $2, $18
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $3, $19
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $4, $20
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.and $push6=, $5, $21
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $6, $22
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop8), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push10=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.and $push12=, $7, $23
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop11), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $8, $24
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT:    i32.and $push16=, $9, $25
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $10, $26
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop18), $pop19
+; NO-SIMD128-FAST-NEXT:    i32.const $push20=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.and $push22=, $11, $27
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop21), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push23=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $12, $28
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop24), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.and $push28=, $13, $29
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop27), $pop28
+; NO-SIMD128-FAST-NEXT:    i32.const $push29=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $14, $30
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop30), $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push32=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push33=, $0, $pop32
+; NO-SIMD128-FAST-NEXT:    i32.and $push34=, $15, $31
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop33), $pop34
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push36=, $0, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $16, $32
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT:    return
   %a = and <16 x i8> %x, %y
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: or_v16i8:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype or_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @or_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SIMD128-LABEL: or_v16i8:
+; SIMD128:         .functype or_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.or $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: or_v16i8:
+; SIMD128-FAST:         .functype or_v16i8 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.or $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: or_v16i8:
+; NO-SIMD128:         .functype or_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.or $push0=, $9, $25
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.or $push1=, $5, $21
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.or $push2=, $3, $19
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop2
+; NO-SIMD128-NEXT:    i32.or $push3=, $2, $18
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop3
+; NO-SIMD128-NEXT:    i32.or $push4=, $1, $17
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push6=, 15
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.or $push5=, $16, $32
+; NO-SIMD128-NEXT:    i32.store8 0($pop7), $pop5
+; NO-SIMD128-NEXT:    i32.const $push9=, 14
+; NO-SIMD128-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT:    i32.or $push8=, $15, $31
+; NO-SIMD128-NEXT:    i32.store8 0($pop10), $pop8
+; NO-SIMD128-NEXT:    i32.const $push12=, 13
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.or $push11=, $14, $30
+; NO-SIMD128-NEXT:    i32.store8 0($pop13), $pop11
+; NO-SIMD128-NEXT:    i32.const $push15=, 12
+; NO-SIMD128-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-NEXT:    i32.or $push14=, $13, $29
+; NO-SIMD128-NEXT:    i32.store8 0($pop16), $pop14
+; NO-SIMD128-NEXT:    i32.const $push18=, 11
+; NO-SIMD128-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-NEXT:    i32.or $push17=, $12, $28
+; NO-SIMD128-NEXT:    i32.store8 0($pop19), $pop17
+; NO-SIMD128-NEXT:    i32.const $push21=, 10
+; NO-SIMD128-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT:    i32.or $push20=, $11, $27
+; NO-SIMD128-NEXT:    i32.store8 0($pop22), $pop20
+; NO-SIMD128-NEXT:    i32.const $push24=, 9
+; NO-SIMD128-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-NEXT:    i32.or $push23=, $10, $26
+; NO-SIMD128-NEXT:    i32.store8 0($pop25), $pop23
+; NO-SIMD128-NEXT:    i32.const $push27=, 7
+; NO-SIMD128-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-NEXT:    i32.or $push26=, $8, $24
+; NO-SIMD128-NEXT:    i32.store8 0($pop28), $pop26
+; NO-SIMD128-NEXT:    i32.const $push30=, 6
+; NO-SIMD128-NEXT:    i32.add $push31=, $0, $pop30
+; NO-SIMD128-NEXT:    i32.or $push29=, $7, $23
+; NO-SIMD128-NEXT:    i32.store8 0($pop31), $pop29
+; NO-SIMD128-NEXT:    i32.const $push33=, 5
+; NO-SIMD128-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-NEXT:    i32.or $push32=, $6, $22
+; NO-SIMD128-NEXT:    i32.store8 0($pop34), $pop32
+; NO-SIMD128-NEXT:    i32.const $push36=, 3
+; NO-SIMD128-NEXT:    i32.add $push37=, $0, $pop36
+; NO-SIMD128-NEXT:    i32.or $push35=, $4, $20
+; NO-SIMD128-NEXT:    i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: or_v16i8:
+; NO-SIMD128-FAST:         .functype or_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.or $push0=, $1, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.or $push1=, $2, $18
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.or $push2=, $3, $19
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.or $push5=, $4, $20
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.or $push6=, $5, $21
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.or $push9=, $6, $22
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop8), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push10=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $7, $23
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop11), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.or $push15=, $8, $24
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT:    i32.or $push16=, $9, $25
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.or $push19=, $10, $26
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop18), $pop19
+; NO-SIMD128-FAST-NEXT:    i32.const $push20=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.or $push22=, $11, $27
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop21), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push23=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.or $push25=, $12, $28
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop24), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.or $push28=, $13, $29
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop27), $pop28
+; NO-SIMD128-FAST-NEXT:    i32.const $push29=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.or $push31=, $14, $30
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop30), $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push32=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push33=, $0, $pop32
+; NO-SIMD128-FAST-NEXT:    i32.or $push34=, $15, $31
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop33), $pop34
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push36=, $0, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.or $push37=, $16, $32
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT:    return
   %a = or <16 x i8> %x, %y
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: xor_v16i8:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype xor_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.xor $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @xor_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SIMD128-LABEL: xor_v16i8:
+; SIMD128:         .functype xor_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.xor $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: xor_v16i8:
+; SIMD128-FAST:         .functype xor_v16i8 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.xor $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: xor_v16i8:
+; NO-SIMD128:         .functype xor_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.xor $push0=, $9, $25
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.xor $push1=, $5, $21
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.xor $push2=, $3, $19
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop2
+; NO-SIMD128-NEXT:    i32.xor $push3=, $2, $18
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop3
+; NO-SIMD128-NEXT:    i32.xor $push4=, $1, $17
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push6=, 15
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.xor $push5=, $16, $32
+; NO-SIMD128-NEXT:    i32.store8 0($pop7), $pop5
+; NO-SIMD128-NEXT:    i32.const $push9=, 14
+; NO-SIMD128-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT:    i32.xor $push8=, $15, $31
+; NO-SIMD128-NEXT:    i32.store8 0($pop10), $pop8
+; NO-SIMD128-NEXT:    i32.const $push12=, 13
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.xor $push11=, $14, $30
+; NO-SIMD128-NEXT:    i32.store8 0($pop13), $pop11
+; NO-SIMD128-NEXT:    i32.const $push15=, 12
+; NO-SIMD128-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-NEXT:    i32.xor $push14=, $13, $29
+; NO-SIMD128-NEXT:    i32.store8 0($pop16), $pop14
+; NO-SIMD128-NEXT:    i32.const $push18=, 11
+; NO-SIMD128-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-NEXT:    i32.xor $push17=, $12, $28
+; NO-SIMD128-NEXT:    i32.store8 0($pop19), $pop17
+; NO-SIMD128-NEXT:    i32.const $push21=, 10
+; NO-SIMD128-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT:    i32.xor $push20=, $11, $27
+; NO-SIMD128-NEXT:    i32.store8 0($pop22), $pop20
+; NO-SIMD128-NEXT:    i32.const $push24=, 9
+; NO-SIMD128-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-NEXT:    i32.xor $push23=, $10, $26
+; NO-SIMD128-NEXT:    i32.store8 0($pop25), $pop23
+; NO-SIMD128-NEXT:    i32.const $push27=, 7
+; NO-SIMD128-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-NEXT:    i32.xor $push26=, $8, $24
+; NO-SIMD128-NEXT:    i32.store8 0($pop28), $pop26
+; NO-SIMD128-NEXT:    i32.const $push30=, 6
+; NO-SIMD128-NEXT:    i32.add $push31=, $0, $pop30
+; NO-SIMD128-NEXT:    i32.xor $push29=, $7, $23
+; NO-SIMD128-NEXT:    i32.store8 0($pop31), $pop29
+; NO-SIMD128-NEXT:    i32.const $push33=, 5
+; NO-SIMD128-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-NEXT:    i32.xor $push32=, $6, $22
+; NO-SIMD128-NEXT:    i32.store8 0($pop34), $pop32
+; NO-SIMD128-NEXT:    i32.const $push36=, 3
+; NO-SIMD128-NEXT:    i32.add $push37=, $0, $pop36
+; NO-SIMD128-NEXT:    i32.xor $push35=, $4, $20
+; NO-SIMD128-NEXT:    i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: xor_v16i8:
+; NO-SIMD128-FAST:         .functype xor_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $1, $17
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.xor $push1=, $2, $18
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $3, $19
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $4, $20
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $5, $21
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $6, $22
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop8), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push10=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.xor $push12=, $7, $23
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop11), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.xor $push15=, $8, $24
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT:    i32.xor $push16=, $9, $25
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.xor $push19=, $10, $26
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop18), $pop19
+; NO-SIMD128-FAST-NEXT:    i32.const $push20=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.xor $push22=, $11, $27
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop21), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push23=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.xor $push25=, $12, $28
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop24), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.xor $push28=, $13, $29
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop27), $pop28
+; NO-SIMD128-FAST-NEXT:    i32.const $push29=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.xor $push31=, $14, $30
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop30), $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push32=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push33=, $0, $pop32
+; NO-SIMD128-FAST-NEXT:    i32.xor $push34=, $15, $31
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop33), $pop34
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push36=, $0, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.xor $push37=, $16, $32
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT:    return
   %a = xor <16 x i8> %x, %y
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: not_v16i8:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype not_v16i8 (v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @not_v16i8(<16 x i8> %x) {
+; SIMD128-LABEL: not_v16i8:
+; SIMD128:         .functype not_v16i8 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.not $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: not_v16i8:
+; SIMD128-FAST:         .functype not_v16i8 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.not $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: not_v16i8:
+; NO-SIMD128:         .functype not_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-NEXT:    i32.xor $push1=, $9, $pop0
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop1
+; NO-SIMD128-NEXT:    i32.const $push53=, -1
+; NO-SIMD128-NEXT:    i32.xor $push2=, $5, $pop53
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push52=, -1
+; NO-SIMD128-NEXT:    i32.xor $push3=, $3, $pop52
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push51=, -1
+; NO-SIMD128-NEXT:    i32.xor $push4=, $2, $pop51
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push50=, -1
+; NO-SIMD128-NEXT:    i32.xor $push5=, $1, $pop50
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop5
+; NO-SIMD128-NEXT:    i32.const $push7=, 15
+; NO-SIMD128-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-NEXT:    i32.const $push49=, -1
+; NO-SIMD128-NEXT:    i32.xor $push6=, $16, $pop49
+; NO-SIMD128-NEXT:    i32.store8 0($pop8), $pop6
+; NO-SIMD128-NEXT:    i32.const $push10=, 14
+; NO-SIMD128-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-NEXT:    i32.const $push48=, -1
+; NO-SIMD128-NEXT:    i32.xor $push9=, $15, $pop48
+; NO-SIMD128-NEXT:    i32.store8 0($pop11), $pop9
+; NO-SIMD128-NEXT:    i32.const $push13=, 13
+; NO-SIMD128-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT:    i32.const $push47=, -1
+; NO-SIMD128-NEXT:    i32.xor $push12=, $14, $pop47
+; NO-SIMD128-NEXT:    i32.store8 0($pop14), $pop12
+; NO-SIMD128-NEXT:    i32.const $push16=, 12
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.const $push46=, -1
+; NO-SIMD128-NEXT:    i32.xor $push15=, $13, $pop46
+; NO-SIMD128-NEXT:    i32.store8 0($pop17), $pop15
+; NO-SIMD128-NEXT:    i32.const $push19=, 11
+; NO-SIMD128-NEXT:    i32.add $push20=, $0, $pop19
+; NO-SIMD128-NEXT:    i32.const $push45=, -1
+; NO-SIMD128-NEXT:    i32.xor $push18=, $12, $pop45
+; NO-SIMD128-NEXT:    i32.store8 0($pop20), $pop18
+; NO-SIMD128-NEXT:    i32.const $push22=, 10
+; NO-SIMD128-NEXT:    i32.add $push23=, $0, $pop22
+; NO-SIMD128-NEXT:    i32.const $push44=, -1
+; NO-SIMD128-NEXT:    i32.xor $push21=, $11, $pop44
+; NO-SIMD128-NEXT:    i32.store8 0($pop23), $pop21
+; NO-SIMD128-NEXT:    i32.const $push25=, 9
+; NO-SIMD128-NEXT:    i32.add $push26=, $0, $pop25
+; NO-SIMD128-NEXT:    i32.const $push43=, -1
+; NO-SIMD128-NEXT:    i32.xor $push24=, $10, $pop43
+; NO-SIMD128-NEXT:    i32.store8 0($pop26), $pop24
+; NO-SIMD128-NEXT:    i32.const $push28=, 7
+; NO-SIMD128-NEXT:    i32.add $push29=, $0, $pop28
+; NO-SIMD128-NEXT:    i32.const $push42=, -1
+; NO-SIMD128-NEXT:    i32.xor $push27=, $8, $pop42
+; NO-SIMD128-NEXT:    i32.store8 0($pop29), $pop27
+; NO-SIMD128-NEXT:    i32.const $push31=, 6
+; NO-SIMD128-NEXT:    i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT:    i32.const $push41=, -1
+; NO-SIMD128-NEXT:    i32.xor $push30=, $7, $pop41
+; NO-SIMD128-NEXT:    i32.store8 0($pop32), $pop30
+; NO-SIMD128-NEXT:    i32.const $push34=, 5
+; NO-SIMD128-NEXT:    i32.add $push35=, $0, $pop34
+; NO-SIMD128-NEXT:    i32.const $push40=, -1
+; NO-SIMD128-NEXT:    i32.xor $push33=, $6, $pop40
+; NO-SIMD128-NEXT:    i32.store8 0($pop35), $pop33
+; NO-SIMD128-NEXT:    i32.const $push37=, 3
+; NO-SIMD128-NEXT:    i32.add $push38=, $0, $pop37
+; NO-SIMD128-NEXT:    i32.const $push39=, -1
+; NO-SIMD128-NEXT:    i32.xor $push36=, $4, $pop39
+; NO-SIMD128-NEXT:    i32.store8 0($pop38), $pop36
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: not_v16i8:
+; NO-SIMD128-FAST:         .functype not_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.const $push53=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $2, $pop53
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push52=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $3, $pop52
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push51=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $4, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push50=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push7=, $5, $pop50
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop7
+; NO-SIMD128-FAST-NEXT:    i32.const $push8=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push49=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $6, $pop49
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop9), $pop10
+; NO-SIMD128-FAST-NEXT:    i32.const $push11=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.const $push48=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push13=, $7, $pop48
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop12), $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push14=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push16=, $8, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push46=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push17=, $9, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push18=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.const $push45=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push20=, $10, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop19), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push44=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push23=, $11, $pop44
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop22), $pop23
+; NO-SIMD128-FAST-NEXT:    i32.const $push24=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $12, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop25), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.const $push42=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push29=, $13, $pop42
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop28), $pop29
+; NO-SIMD128-FAST-NEXT:    i32.const $push30=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push31=, $0, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.const $push41=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push32=, $14, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop31), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.const $push40=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push35=, $15, $pop40
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop34), $pop35
+; NO-SIMD128-FAST-NEXT:    i32.const $push36=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push37=, $0, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push38=, $16, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop37), $pop38
+; NO-SIMD128-FAST-NEXT:    return
   %a = xor <16 x i8> %x, <i8 -1, i8 -1, i8 -1, i8 -1,
                           i8 -1, i8 -1, i8 -1, i8 -1,
                           i8 -1, i8 -1, i8 -1, i8 -1,
@@ -307,15 +5256,201 @@ define <16 x i8> @not_v16i8(<16 x i8> %x) {
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: andnot_v16i8:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype andnot_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: v128.andnot $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
-; SIMD128-FAST-NEXT: v128.not
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: return
 define <16 x i8> @andnot_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SIMD128-LABEL: andnot_v16i8:
+; SIMD128:         .functype andnot_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.andnot $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: andnot_v16i8:
+; SIMD128-FAST:         .functype andnot_v16i8 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.not $push0=, $1
+; SIMD128-FAST-NEXT:    v128.and $push1=, $0, $pop0
+; SIMD128-FAST-NEXT:    return $pop1
+;
+; NO-SIMD128-LABEL: andnot_v16i8:
+; NO-SIMD128:         .functype andnot_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-NEXT:    i32.xor $push1=, $25, $pop0
+; NO-SIMD128-NEXT:    i32.and $push2=, $9, $pop1
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push69=, -1
+; NO-SIMD128-NEXT:    i32.xor $push3=, $21, $pop69
+; NO-SIMD128-NEXT:    i32.and $push4=, $5, $pop3
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push68=, -1
+; NO-SIMD128-NEXT:    i32.xor $push5=, $19, $pop68
+; NO-SIMD128-NEXT:    i32.and $push6=, $3, $pop5
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop6
+; NO-SIMD128-NEXT:    i32.const $push67=, -1
+; NO-SIMD128-NEXT:    i32.xor $push7=, $18, $pop67
+; NO-SIMD128-NEXT:    i32.and $push8=, $2, $pop7
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop8
+; NO-SIMD128-NEXT:    i32.const $push66=, -1
+; NO-SIMD128-NEXT:    i32.xor $push9=, $17, $pop66
+; NO-SIMD128-NEXT:    i32.and $push10=, $1, $pop9
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop10
+; NO-SIMD128-NEXT:    i32.const $push13=, 15
+; NO-SIMD128-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT:    i32.const $push65=, -1
+; NO-SIMD128-NEXT:    i32.xor $push11=, $32, $pop65
+; NO-SIMD128-NEXT:    i32.and $push12=, $16, $pop11
+; NO-SIMD128-NEXT:    i32.store8 0($pop14), $pop12
+; NO-SIMD128-NEXT:    i32.const $push17=, 14
+; NO-SIMD128-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-NEXT:    i32.const $push64=, -1
+; NO-SIMD128-NEXT:    i32.xor $push15=, $31, $pop64
+; NO-SIMD128-NEXT:    i32.and $push16=, $15, $pop15
+; NO-SIMD128-NEXT:    i32.store8 0($pop18), $pop16
+; NO-SIMD128-NEXT:    i32.const $push21=, 13
+; NO-SIMD128-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT:    i32.const $push63=, -1
+; NO-SIMD128-NEXT:    i32.xor $push19=, $30, $pop63
+; NO-SIMD128-NEXT:    i32.and $push20=, $14, $pop19
+; NO-SIMD128-NEXT:    i32.store8 0($pop22), $pop20
+; NO-SIMD128-NEXT:    i32.const $push25=, 12
+; NO-SIMD128-NEXT:    i32.add $push26=, $0, $pop25
+; NO-SIMD128-NEXT:    i32.const $push62=, -1
+; NO-SIMD128-NEXT:    i32.xor $push23=, $29, $pop62
+; NO-SIMD128-NEXT:    i32.and $push24=, $13, $pop23
+; NO-SIMD128-NEXT:    i32.store8 0($pop26), $pop24
+; NO-SIMD128-NEXT:    i32.const $push29=, 11
+; NO-SIMD128-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-NEXT:    i32.const $push61=, -1
+; NO-SIMD128-NEXT:    i32.xor $push27=, $28, $pop61
+; NO-SIMD128-NEXT:    i32.and $push28=, $12, $pop27
+; NO-SIMD128-NEXT:    i32.store8 0($pop30), $pop28
+; NO-SIMD128-NEXT:    i32.const $push33=, 10
+; NO-SIMD128-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-NEXT:    i32.const $push60=, -1
+; NO-SIMD128-NEXT:    i32.xor $push31=, $27, $pop60
+; NO-SIMD128-NEXT:    i32.and $push32=, $11, $pop31
+; NO-SIMD128-NEXT:    i32.store8 0($pop34), $pop32
+; NO-SIMD128-NEXT:    i32.const $push37=, 9
+; NO-SIMD128-NEXT:    i32.add $push38=, $0, $pop37
+; NO-SIMD128-NEXT:    i32.const $push59=, -1
+; NO-SIMD128-NEXT:    i32.xor $push35=, $26, $pop59
+; NO-SIMD128-NEXT:    i32.and $push36=, $10, $pop35
+; NO-SIMD128-NEXT:    i32.store8 0($pop38), $pop36
+; NO-SIMD128-NEXT:    i32.const $push41=, 7
+; NO-SIMD128-NEXT:    i32.add $push42=, $0, $pop41
+; NO-SIMD128-NEXT:    i32.const $push58=, -1
+; NO-SIMD128-NEXT:    i32.xor $push39=, $24, $pop58
+; NO-SIMD128-NEXT:    i32.and $push40=, $8, $pop39
+; NO-SIMD128-NEXT:    i32.store8 0($pop42), $pop40
+; NO-SIMD128-NEXT:    i32.const $push45=, 6
+; NO-SIMD128-NEXT:    i32.add $push46=, $0, $pop45
+; NO-SIMD128-NEXT:    i32.const $push57=, -1
+; NO-SIMD128-NEXT:    i32.xor $push43=, $23, $pop57
+; NO-SIMD128-NEXT:    i32.and $push44=, $7, $pop43
+; NO-SIMD128-NEXT:    i32.store8 0($pop46), $pop44
+; NO-SIMD128-NEXT:    i32.const $push49=, 5
+; NO-SIMD128-NEXT:    i32.add $push50=, $0, $pop49
+; NO-SIMD128-NEXT:    i32.const $push56=, -1
+; NO-SIMD128-NEXT:    i32.xor $push47=, $22, $pop56
+; NO-SIMD128-NEXT:    i32.and $push48=, $6, $pop47
+; NO-SIMD128-NEXT:    i32.store8 0($pop50), $pop48
+; NO-SIMD128-NEXT:    i32.const $push53=, 3
+; NO-SIMD128-NEXT:    i32.add $push54=, $0, $pop53
+; NO-SIMD128-NEXT:    i32.const $push55=, -1
+; NO-SIMD128-NEXT:    i32.xor $push51=, $20, $pop55
+; NO-SIMD128-NEXT:    i32.and $push52=, $4, $pop51
+; NO-SIMD128-NEXT:    i32.store8 0($pop54), $pop52
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: andnot_v16i8:
+; NO-SIMD128-FAST:         .functype andnot_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push1=, $17, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push69=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $18, $pop69
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $2, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push68=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $19, $pop68
+; NO-SIMD128-FAST-NEXT:    i32.and $push6=, $3, $pop5
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.const $push67=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $20, $pop67
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $4, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop8), $pop10
+; NO-SIMD128-FAST-NEXT:    i32.const $push66=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $21, $pop66
+; NO-SIMD128-FAST-NEXT:    i32.and $push12=, $5, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push65=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push15=, $22, $pop65
+; NO-SIMD128-FAST-NEXT:    i32.and $push16=, $6, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop14), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push64=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push19=, $23, $pop64
+; NO-SIMD128-FAST-NEXT:    i32.and $push20=, $7, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop18), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push63=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push23=, $24, $pop63
+; NO-SIMD128-FAST-NEXT:    i32.and $push24=, $8, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop22), $pop24
+; NO-SIMD128-FAST-NEXT:    i32.const $push62=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push25=, $25, $pop62
+; NO-SIMD128-FAST-NEXT:    i32.and $push26=, $9, $pop25
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.const $push61=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push29=, $26, $pop61
+; NO-SIMD128-FAST-NEXT:    i32.and $push30=, $10, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop28), $pop30
+; NO-SIMD128-FAST-NEXT:    i32.const $push31=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push60=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push33=, $27, $pop60
+; NO-SIMD128-FAST-NEXT:    i32.and $push34=, $11, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop32), $pop34
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push36=, $0, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.const $push59=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push37=, $28, $pop59
+; NO-SIMD128-FAST-NEXT:    i32.and $push38=, $12, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop36), $pop38
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push40=, $0, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.const $push58=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push41=, $29, $pop58
+; NO-SIMD128-FAST-NEXT:    i32.and $push42=, $13, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop40), $pop42
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push44=, $0, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.const $push57=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push45=, $30, $pop57
+; NO-SIMD128-FAST-NEXT:    i32.and $push46=, $14, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop44), $pop46
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push48=, $0, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.const $push56=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push49=, $31, $pop56
+; NO-SIMD128-FAST-NEXT:    i32.and $push50=, $15, $pop49
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop48), $pop50
+; NO-SIMD128-FAST-NEXT:    i32.const $push51=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push52=, $0, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.const $push55=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push53=, $32, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.and $push54=, $16, $pop53
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop52), $pop54
+; NO-SIMD128-FAST-NEXT:    return
  %inv_y = xor <16 x i8> %y,
    <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
     i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -323,17 +5458,267 @@ define <16 x i8> @andnot_v16i8(<16 x i8> %x, <16 x i8> %y) {
  ret <16 x i8> %a
 }
 
-; CHECK-LABEL: bitselect_v16i8:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype bitselect_v16i8 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: v128.not
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: v128.or
-; SIMD128-FAST-NEXT: return
 define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
+; SIMD128-LABEL: bitselect_v16i8:
+; SIMD128:         .functype bitselect_v16i8 (v128, v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.bitselect $push0=, $1, $2, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: bitselect_v16i8:
+; SIMD128-FAST:         .functype bitselect_v16i8 (v128, v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.and $push0=, $0, $1
+; SIMD128-FAST-NEXT:    v128.not $push2=, $0
+; SIMD128-FAST-NEXT:    v128.and $push3=, $pop2, $2
+; SIMD128-FAST-NEXT:    v128.or $push1=, $pop0, $pop3
+; SIMD128-FAST-NEXT:    return $pop1
+;
+; NO-SIMD128-LABEL: bitselect_v16i8:
+; NO-SIMD128:         .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push5=, 15
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.and $push0=, $16, $32
+; NO-SIMD128-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-NEXT:    i32.xor $push2=, $16, $pop1
+; NO-SIMD128-NEXT:    i32.and $push3=, $pop2, $48
+; NO-SIMD128-NEXT:    i32.or $push4=, $pop0, $pop3
+; NO-SIMD128-NEXT:    i32.store8 0($pop6), $pop4
+; NO-SIMD128-NEXT:    i32.const $push11=, 14
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.and $push7=, $15, $31
+; NO-SIMD128-NEXT:    i32.const $push101=, -1
+; NO-SIMD128-NEXT:    i32.xor $push8=, $15, $pop101
+; NO-SIMD128-NEXT:    i32.and $push9=, $pop8, $47
+; NO-SIMD128-NEXT:    i32.or $push10=, $pop7, $pop9
+; NO-SIMD128-NEXT:    i32.store8 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push17=, 13
+; NO-SIMD128-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-NEXT:    i32.and $push13=, $14, $30
+; NO-SIMD128-NEXT:    i32.const $push100=, -1
+; NO-SIMD128-NEXT:    i32.xor $push14=, $14, $pop100
+; NO-SIMD128-NEXT:    i32.and $push15=, $pop14, $46
+; NO-SIMD128-NEXT:    i32.or $push16=, $pop13, $pop15
+; NO-SIMD128-NEXT:    i32.store8 0($pop18), $pop16
+; NO-SIMD128-NEXT:    i32.const $push23=, 12
+; NO-SIMD128-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-NEXT:    i32.and $push19=, $13, $29
+; NO-SIMD128-NEXT:    i32.const $push99=, -1
+; NO-SIMD128-NEXT:    i32.xor $push20=, $13, $pop99
+; NO-SIMD128-NEXT:    i32.and $push21=, $pop20, $45
+; NO-SIMD128-NEXT:    i32.or $push22=, $pop19, $pop21
+; NO-SIMD128-NEXT:    i32.store8 0($pop24), $pop22
+; NO-SIMD128-NEXT:    i32.const $push29=, 11
+; NO-SIMD128-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-NEXT:    i32.and $push25=, $12, $28
+; NO-SIMD128-NEXT:    i32.const $push98=, -1
+; NO-SIMD128-NEXT:    i32.xor $push26=, $12, $pop98
+; NO-SIMD128-NEXT:    i32.and $push27=, $pop26, $44
+; NO-SIMD128-NEXT:    i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-NEXT:    i32.store8 0($pop30), $pop28
+; NO-SIMD128-NEXT:    i32.const $push35=, 10
+; NO-SIMD128-NEXT:    i32.add $push36=, $0, $pop35
+; NO-SIMD128-NEXT:    i32.and $push31=, $11, $27
+; NO-SIMD128-NEXT:    i32.const $push97=, -1
+; NO-SIMD128-NEXT:    i32.xor $push32=, $11, $pop97
+; NO-SIMD128-NEXT:    i32.and $push33=, $pop32, $43
+; NO-SIMD128-NEXT:    i32.or $push34=, $pop31, $pop33
+; NO-SIMD128-NEXT:    i32.store8 0($pop36), $pop34
+; NO-SIMD128-NEXT:    i32.const $push41=, 9
+; NO-SIMD128-NEXT:    i32.add $push42=, $0, $pop41
+; NO-SIMD128-NEXT:    i32.and $push37=, $10, $26
+; NO-SIMD128-NEXT:    i32.const $push96=, -1
+; NO-SIMD128-NEXT:    i32.xor $push38=, $10, $pop96
+; NO-SIMD128-NEXT:    i32.and $push39=, $pop38, $42
+; NO-SIMD128-NEXT:    i32.or $push40=, $pop37, $pop39
+; NO-SIMD128-NEXT:    i32.store8 0($pop42), $pop40
+; NO-SIMD128-NEXT:    i32.and $push43=, $9, $25
+; NO-SIMD128-NEXT:    i32.const $push95=, -1
+; NO-SIMD128-NEXT:    i32.xor $push44=, $9, $pop95
+; NO-SIMD128-NEXT:    i32.and $push45=, $pop44, $41
+; NO-SIMD128-NEXT:    i32.or $push46=, $pop43, $pop45
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop46
+; NO-SIMD128-NEXT:    i32.const $push51=, 7
+; NO-SIMD128-NEXT:    i32.add $push52=, $0, $pop51
+; NO-SIMD128-NEXT:    i32.and $push47=, $8, $24
+; NO-SIMD128-NEXT:    i32.const $push94=, -1
+; NO-SIMD128-NEXT:    i32.xor $push48=, $8, $pop94
+; NO-SIMD128-NEXT:    i32.and $push49=, $pop48, $40
+; NO-SIMD128-NEXT:    i32.or $push50=, $pop47, $pop49
+; NO-SIMD128-NEXT:    i32.store8 0($pop52), $pop50
+; NO-SIMD128-NEXT:    i32.const $push57=, 6
+; NO-SIMD128-NEXT:    i32.add $push58=, $0, $pop57
+; NO-SIMD128-NEXT:    i32.and $push53=, $7, $23
+; NO-SIMD128-NEXT:    i32.const $push93=, -1
+; NO-SIMD128-NEXT:    i32.xor $push54=, $7, $pop93
+; NO-SIMD128-NEXT:    i32.and $push55=, $pop54, $39
+; NO-SIMD128-NEXT:    i32.or $push56=, $pop53, $pop55
+; NO-SIMD128-NEXT:    i32.store8 0($pop58), $pop56
+; NO-SIMD128-NEXT:    i32.const $push63=, 5
+; NO-SIMD128-NEXT:    i32.add $push64=, $0, $pop63
+; NO-SIMD128-NEXT:    i32.and $push59=, $6, $22
+; NO-SIMD128-NEXT:    i32.const $push92=, -1
+; NO-SIMD128-NEXT:    i32.xor $push60=, $6, $pop92
+; NO-SIMD128-NEXT:    i32.and $push61=, $pop60, $38
+; NO-SIMD128-NEXT:    i32.or $push62=, $pop59, $pop61
+; NO-SIMD128-NEXT:    i32.store8 0($pop64), $pop62
+; NO-SIMD128-NEXT:    i32.and $push65=, $5, $21
+; NO-SIMD128-NEXT:    i32.const $push91=, -1
+; NO-SIMD128-NEXT:    i32.xor $push66=, $5, $pop91
+; NO-SIMD128-NEXT:    i32.and $push67=, $pop66, $37
+; NO-SIMD128-NEXT:    i32.or $push68=, $pop65, $pop67
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop68
+; NO-SIMD128-NEXT:    i32.const $push73=, 3
+; NO-SIMD128-NEXT:    i32.add $push74=, $0, $pop73
+; NO-SIMD128-NEXT:    i32.and $push69=, $4, $20
+; NO-SIMD128-NEXT:    i32.const $push90=, -1
+; NO-SIMD128-NEXT:    i32.xor $push70=, $4, $pop90
+; NO-SIMD128-NEXT:    i32.and $push71=, $pop70, $36
+; NO-SIMD128-NEXT:    i32.or $push72=, $pop69, $pop71
+; NO-SIMD128-NEXT:    i32.store8 0($pop74), $pop72
+; NO-SIMD128-NEXT:    i32.and $push75=, $3, $19
+; NO-SIMD128-NEXT:    i32.const $push89=, -1
+; NO-SIMD128-NEXT:    i32.xor $push76=, $3, $pop89
+; NO-SIMD128-NEXT:    i32.and $push77=, $pop76, $35
+; NO-SIMD128-NEXT:    i32.or $push78=, $pop75, $pop77
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop78
+; NO-SIMD128-NEXT:    i32.and $push79=, $2, $18
+; NO-SIMD128-NEXT:    i32.const $push88=, -1
+; NO-SIMD128-NEXT:    i32.xor $push80=, $2, $pop88
+; NO-SIMD128-NEXT:    i32.and $push81=, $pop80, $34
+; NO-SIMD128-NEXT:    i32.or $push82=, $pop79, $pop81
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop82
+; NO-SIMD128-NEXT:    i32.and $push83=, $1, $17
+; NO-SIMD128-NEXT:    i32.const $push87=, -1
+; NO-SIMD128-NEXT:    i32.xor $push84=, $1, $pop87
+; NO-SIMD128-NEXT:    i32.and $push85=, $pop84, $33
+; NO-SIMD128-NEXT:    i32.or $push86=, $pop83, $pop85
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop86
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: bitselect_v16i8:
+; NO-SIMD128-FAST:         .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $1, $17
+; NO-SIMD128-FAST-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $pop2, $33
+; NO-SIMD128-FAST-NEXT:    i32.or $push4=, $pop0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $2, $18
+; NO-SIMD128-FAST-NEXT:    i32.const $push101=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $2, $pop101
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $34
+; NO-SIMD128-FAST-NEXT:    i32.or $push8=, $pop5, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $3, $19
+; NO-SIMD128-FAST-NEXT:    i32.const $push100=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $3, $pop100
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $pop10, $35
+; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $pop9, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $4, $20
+; NO-SIMD128-FAST-NEXT:    i32.const $push99=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $4, $pop99
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $pop14, $36
+; NO-SIMD128-FAST-NEXT:    i32.or $push16=, $pop13, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop18), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $5, $21
+; NO-SIMD128-FAST-NEXT:    i32.const $push98=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push20=, $5, $pop98
+; NO-SIMD128-FAST-NEXT:    i32.and $push21=, $pop20, $37
+; NO-SIMD128-FAST-NEXT:    i32.or $push22=, $pop19, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $6, $22
+; NO-SIMD128-FAST-NEXT:    i32.const $push97=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push24=, $6, $pop97
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $pop24, $38
+; NO-SIMD128-FAST-NEXT:    i32.or $push26=, $pop23, $pop25
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop28), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $7, $23
+; NO-SIMD128-FAST-NEXT:    i32.const $push96=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push30=, $7, $pop96
+; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $pop30, $39
+; NO-SIMD128-FAST-NEXT:    i32.or $push32=, $pop29, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop34), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push40=, $0, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.and $push35=, $8, $24
+; NO-SIMD128-FAST-NEXT:    i32.const $push95=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push36=, $8, $pop95
+; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $pop36, $40
+; NO-SIMD128-FAST-NEXT:    i32.or $push38=, $pop35, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT:    i32.and $push41=, $9, $25
+; NO-SIMD128-FAST-NEXT:    i32.const $push94=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push42=, $9, $pop94
+; NO-SIMD128-FAST-NEXT:    i32.and $push43=, $pop42, $41
+; NO-SIMD128-FAST-NEXT:    i32.or $push44=, $pop41, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop44
+; NO-SIMD128-FAST-NEXT:    i32.const $push49=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push50=, $0, $pop49
+; NO-SIMD128-FAST-NEXT:    i32.and $push45=, $10, $26
+; NO-SIMD128-FAST-NEXT:    i32.const $push93=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push46=, $10, $pop93
+; NO-SIMD128-FAST-NEXT:    i32.and $push47=, $pop46, $42
+; NO-SIMD128-FAST-NEXT:    i32.or $push48=, $pop45, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop50), $pop48
+; NO-SIMD128-FAST-NEXT:    i32.const $push55=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push56=, $0, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.and $push51=, $11, $27
+; NO-SIMD128-FAST-NEXT:    i32.const $push92=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push52=, $11, $pop92
+; NO-SIMD128-FAST-NEXT:    i32.and $push53=, $pop52, $43
+; NO-SIMD128-FAST-NEXT:    i32.or $push54=, $pop51, $pop53
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop56), $pop54
+; NO-SIMD128-FAST-NEXT:    i32.const $push61=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push62=, $0, $pop61
+; NO-SIMD128-FAST-NEXT:    i32.and $push57=, $12, $28
+; NO-SIMD128-FAST-NEXT:    i32.const $push91=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push58=, $12, $pop91
+; NO-SIMD128-FAST-NEXT:    i32.and $push59=, $pop58, $44
+; NO-SIMD128-FAST-NEXT:    i32.or $push60=, $pop57, $pop59
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop62), $pop60
+; NO-SIMD128-FAST-NEXT:    i32.const $push67=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push68=, $0, $pop67
+; NO-SIMD128-FAST-NEXT:    i32.and $push63=, $13, $29
+; NO-SIMD128-FAST-NEXT:    i32.const $push90=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push64=, $13, $pop90
+; NO-SIMD128-FAST-NEXT:    i32.and $push65=, $pop64, $45
+; NO-SIMD128-FAST-NEXT:    i32.or $push66=, $pop63, $pop65
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop68), $pop66
+; NO-SIMD128-FAST-NEXT:    i32.const $push73=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push74=, $0, $pop73
+; NO-SIMD128-FAST-NEXT:    i32.and $push69=, $14, $30
+; NO-SIMD128-FAST-NEXT:    i32.const $push89=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push70=, $14, $pop89
+; NO-SIMD128-FAST-NEXT:    i32.and $push71=, $pop70, $46
+; NO-SIMD128-FAST-NEXT:    i32.or $push72=, $pop69, $pop71
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop74), $pop72
+; NO-SIMD128-FAST-NEXT:    i32.const $push79=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push80=, $0, $pop79
+; NO-SIMD128-FAST-NEXT:    i32.and $push75=, $15, $31
+; NO-SIMD128-FAST-NEXT:    i32.const $push88=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push76=, $15, $pop88
+; NO-SIMD128-FAST-NEXT:    i32.and $push77=, $pop76, $47
+; NO-SIMD128-FAST-NEXT:    i32.or $push78=, $pop75, $pop77
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop80), $pop78
+; NO-SIMD128-FAST-NEXT:    i32.const $push85=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push86=, $0, $pop85
+; NO-SIMD128-FAST-NEXT:    i32.and $push81=, $16, $32
+; NO-SIMD128-FAST-NEXT:    i32.const $push87=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push82=, $16, $pop87
+; NO-SIMD128-FAST-NEXT:    i32.and $push83=, $pop82, $48
+; NO-SIMD128-FAST-NEXT:    i32.or $push84=, $pop81, $pop83
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop86), $pop84
+; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <16 x i8> %c, %v1
   %inv_mask = xor <16 x i8> %c,
     <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
@@ -343,31 +5728,469 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
   ret <16 x i8> %a
 }
 
-; CHECK-LABEL: bitselect_xor_v16i8:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype bitselect_xor_v16i8 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
-; SIMD128-FAST-NEXT: v128.xor
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: v128.xor
 define <16 x i8> @bitselect_xor_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
+; SIMD128-LABEL: bitselect_xor_v16i8:
+; SIMD128:         .functype bitselect_xor_v16i8 (v128, v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.bitselect $push0=, $1, $2, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: bitselect_xor_v16i8:
+; SIMD128-FAST:         .functype bitselect_xor_v16i8 (v128, v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.xor $push2=, $1, $2
+; SIMD128-FAST-NEXT:    v128.and $push1=, $pop2, $0
+; SIMD128-FAST-NEXT:    v128.xor $push0=, $pop1, $2
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: bitselect_xor_v16i8:
+; NO-SIMD128:         .functype bitselect_xor_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push3=, 15
+; NO-SIMD128-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-NEXT:    i32.xor $push0=, $32, $48
+; NO-SIMD128-NEXT:    i32.and $push1=, $pop0, $16
+; NO-SIMD128-NEXT:    i32.xor $push2=, $pop1, $48
+; NO-SIMD128-NEXT:    i32.store8 0($pop4), $pop2
+; NO-SIMD128-NEXT:    i32.const $push8=, 14
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.xor $push5=, $31, $47
+; NO-SIMD128-NEXT:    i32.and $push6=, $pop5, $15
+; NO-SIMD128-NEXT:    i32.xor $push7=, $pop6, $47
+; NO-SIMD128-NEXT:    i32.store8 0($pop9), $pop7
+; NO-SIMD128-NEXT:    i32.const $push13=, 13
+; NO-SIMD128-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT:    i32.xor $push10=, $30, $46
+; NO-SIMD128-NEXT:    i32.and $push11=, $pop10, $14
+; NO-SIMD128-NEXT:    i32.xor $push12=, $pop11, $46
+; NO-SIMD128-NEXT:    i32.store8 0($pop14), $pop12
+; NO-SIMD128-NEXT:    i32.const $push18=, 12
+; NO-SIMD128-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-NEXT:    i32.xor $push15=, $29, $45
+; NO-SIMD128-NEXT:    i32.and $push16=, $pop15, $13
+; NO-SIMD128-NEXT:    i32.xor $push17=, $pop16, $45
+; NO-SIMD128-NEXT:    i32.store8 0($pop19), $pop17
+; NO-SIMD128-NEXT:    i32.const $push23=, 11
+; NO-SIMD128-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-NEXT:    i32.xor $push20=, $28, $44
+; NO-SIMD128-NEXT:    i32.and $push21=, $pop20, $12
+; NO-SIMD128-NEXT:    i32.xor $push22=, $pop21, $44
+; NO-SIMD128-NEXT:    i32.store8 0($pop24), $pop22
+; NO-SIMD128-NEXT:    i32.const $push28=, 10
+; NO-SIMD128-NEXT:    i32.add $push29=, $0, $pop28
+; NO-SIMD128-NEXT:    i32.xor $push25=, $27, $43
+; NO-SIMD128-NEXT:    i32.and $push26=, $pop25, $11
+; NO-SIMD128-NEXT:    i32.xor $push27=, $pop26, $43
+; NO-SIMD128-NEXT:    i32.store8 0($pop29), $pop27
+; NO-SIMD128-NEXT:    i32.const $push33=, 9
+; NO-SIMD128-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-NEXT:    i32.xor $push30=, $26, $42
+; NO-SIMD128-NEXT:    i32.and $push31=, $pop30, $10
+; NO-SIMD128-NEXT:    i32.xor $push32=, $pop31, $42
+; NO-SIMD128-NEXT:    i32.store8 0($pop34), $pop32
+; NO-SIMD128-NEXT:    i32.xor $push35=, $25, $41
+; NO-SIMD128-NEXT:    i32.and $push36=, $pop35, $9
+; NO-SIMD128-NEXT:    i32.xor $push37=, $pop36, $41
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop37
+; NO-SIMD128-NEXT:    i32.const $push41=, 7
+; NO-SIMD128-NEXT:    i32.add $push42=, $0, $pop41
+; NO-SIMD128-NEXT:    i32.xor $push38=, $24, $40
+; NO-SIMD128-NEXT:    i32.and $push39=, $pop38, $8
+; NO-SIMD128-NEXT:    i32.xor $push40=, $pop39, $40
+; NO-SIMD128-NEXT:    i32.store8 0($pop42), $pop40
+; NO-SIMD128-NEXT:    i32.const $push46=, 6
+; NO-SIMD128-NEXT:    i32.add $push47=, $0, $pop46
+; NO-SIMD128-NEXT:    i32.xor $push43=, $23, $39
+; NO-SIMD128-NEXT:    i32.and $push44=, $pop43, $7
+; NO-SIMD128-NEXT:    i32.xor $push45=, $pop44, $39
+; NO-SIMD128-NEXT:    i32.store8 0($pop47), $pop45
+; NO-SIMD128-NEXT:    i32.const $push51=, 5
+; NO-SIMD128-NEXT:    i32.add $push52=, $0, $pop51
+; NO-SIMD128-NEXT:    i32.xor $push48=, $22, $38
+; NO-SIMD128-NEXT:    i32.and $push49=, $pop48, $6
+; NO-SIMD128-NEXT:    i32.xor $push50=, $pop49, $38
+; NO-SIMD128-NEXT:    i32.store8 0($pop52), $pop50
+; NO-SIMD128-NEXT:    i32.xor $push53=, $21, $37
+; NO-SIMD128-NEXT:    i32.and $push54=, $pop53, $5
+; NO-SIMD128-NEXT:    i32.xor $push55=, $pop54, $37
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop55
+; NO-SIMD128-NEXT:    i32.const $push59=, 3
+; NO-SIMD128-NEXT:    i32.add $push60=, $0, $pop59
+; NO-SIMD128-NEXT:    i32.xor $push56=, $20, $36
+; NO-SIMD128-NEXT:    i32.and $push57=, $pop56, $4
+; NO-SIMD128-NEXT:    i32.xor $push58=, $pop57, $36
+; NO-SIMD128-NEXT:    i32.store8 0($pop60), $pop58
+; NO-SIMD128-NEXT:    i32.xor $push61=, $19, $35
+; NO-SIMD128-NEXT:    i32.and $push62=, $pop61, $3
+; NO-SIMD128-NEXT:    i32.xor $push63=, $pop62, $35
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop63
+; NO-SIMD128-NEXT:    i32.xor $push64=, $18, $34
+; NO-SIMD128-NEXT:    i32.and $push65=, $pop64, $2
+; NO-SIMD128-NEXT:    i32.xor $push66=, $pop65, $34
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop66
+; NO-SIMD128-NEXT:    i32.xor $push67=, $17, $33
+; NO-SIMD128-NEXT:    i32.and $push68=, $pop67, $1
+; NO-SIMD128-NEXT:    i32.xor $push69=, $pop68, $33
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop69
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: bitselect_xor_v16i8:
+; NO-SIMD128-FAST:         .functype bitselect_xor_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $17, $33
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $pop1, $33
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $18, $34
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop3, $2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $pop4, $34
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $19, $35
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $3
+; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $35
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push9=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $20, $36
+; NO-SIMD128-FAST-NEXT:    i32.and $push12=, $pop11, $4
+; NO-SIMD128-FAST-NEXT:    i32.xor $push13=, $pop12, $36
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop10), $pop13
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $21, $37
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $pop14, $5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push16=, $pop15, $37
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.xor $push19=, $22, $38
+; NO-SIMD128-FAST-NEXT:    i32.and $push20=, $pop19, $6
+; NO-SIMD128-FAST-NEXT:    i32.xor $push21=, $pop20, $38
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop18), $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push22=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push23=, $0, $pop22
+; NO-SIMD128-FAST-NEXT:    i32.xor $push24=, $23, $39
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $pop24, $7
+; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $pop25, $39
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop23), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.xor $push29=, $24, $40
+; NO-SIMD128-FAST-NEXT:    i32.and $push30=, $pop29, $8
+; NO-SIMD128-FAST-NEXT:    i32.xor $push31=, $pop30, $40
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop28), $pop31
+; NO-SIMD128-FAST-NEXT:    i32.xor $push32=, $25, $41
+; NO-SIMD128-FAST-NEXT:    i32.and $push33=, $pop32, $9
+; NO-SIMD128-FAST-NEXT:    i32.xor $push34=, $pop33, $41
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop34
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push36=, $0, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.xor $push37=, $26, $42
+; NO-SIMD128-FAST-NEXT:    i32.and $push38=, $pop37, $10
+; NO-SIMD128-FAST-NEXT:    i32.xor $push39=, $pop38, $42
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop36), $pop39
+; NO-SIMD128-FAST-NEXT:    i32.const $push40=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push41=, $0, $pop40
+; NO-SIMD128-FAST-NEXT:    i32.xor $push42=, $27, $43
+; NO-SIMD128-FAST-NEXT:    i32.and $push43=, $pop42, $11
+; NO-SIMD128-FAST-NEXT:    i32.xor $push44=, $pop43, $43
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop41), $pop44
+; NO-SIMD128-FAST-NEXT:    i32.const $push45=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push46=, $0, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.xor $push47=, $28, $44
+; NO-SIMD128-FAST-NEXT:    i32.and $push48=, $pop47, $12
+; NO-SIMD128-FAST-NEXT:    i32.xor $push49=, $pop48, $44
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop46), $pop49
+; NO-SIMD128-FAST-NEXT:    i32.const $push50=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push51=, $0, $pop50
+; NO-SIMD128-FAST-NEXT:    i32.xor $push52=, $29, $45
+; NO-SIMD128-FAST-NEXT:    i32.and $push53=, $pop52, $13
+; NO-SIMD128-FAST-NEXT:    i32.xor $push54=, $pop53, $45
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop51), $pop54
+; NO-SIMD128-FAST-NEXT:    i32.const $push55=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push56=, $0, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.xor $push57=, $30, $46
+; NO-SIMD128-FAST-NEXT:    i32.and $push58=, $pop57, $14
+; NO-SIMD128-FAST-NEXT:    i32.xor $push59=, $pop58, $46
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop56), $pop59
+; NO-SIMD128-FAST-NEXT:    i32.const $push60=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push61=, $0, $pop60
+; NO-SIMD128-FAST-NEXT:    i32.xor $push62=, $31, $47
+; NO-SIMD128-FAST-NEXT:    i32.and $push63=, $pop62, $15
+; NO-SIMD128-FAST-NEXT:    i32.xor $push64=, $pop63, $47
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop61), $pop64
+; NO-SIMD128-FAST-NEXT:    i32.const $push65=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push66=, $0, $pop65
+; NO-SIMD128-FAST-NEXT:    i32.xor $push67=, $32, $48
+; NO-SIMD128-FAST-NEXT:    i32.and $push68=, $pop67, $16
+; NO-SIMD128-FAST-NEXT:    i32.xor $push69=, $pop68, $48
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop66), $pop69
+; NO-SIMD128-FAST-NEXT:    return
  %xor1 = xor <16 x i8> %v1, %v2
  %and = and <16 x i8> %xor1, %c
  %a = xor <16 x i8> %and, %v2
  ret <16 x i8> %a
 }
 
-; CHECK-LABEL: bitselect_xor_reversed_v16i8:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype bitselect_xor_reversed_v16i8 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $2, $1, $0{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
-; SIMD128-FAST-NEXT: v128.xor
-; SIMD128-FAST-NEXT: v128.not
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: v128.xor
 define <16 x i8> @bitselect_xor_reversed_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
+; SIMD128-LABEL: bitselect_xor_reversed_v16i8:
+; SIMD128:         .functype bitselect_xor_reversed_v16i8 (v128, v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.bitselect $push0=, $2, $1, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: bitselect_xor_reversed_v16i8:
+; SIMD128-FAST:         .functype bitselect_xor_reversed_v16i8 (v128, v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.xor $push1=, $1, $2
+; SIMD128-FAST-NEXT:    v128.not $push2=, $0
+; SIMD128-FAST-NEXT:    v128.and $push3=, $pop1, $pop2
+; SIMD128-FAST-NEXT:    v128.xor $push0=, $pop3, $2
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: bitselect_xor_reversed_v16i8:
+; NO-SIMD128:         .functype bitselect_xor_reversed_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push5=, 15
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.xor $push2=, $32, $48
+; NO-SIMD128-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-NEXT:    i32.xor $push1=, $16, $pop0
+; NO-SIMD128-NEXT:    i32.and $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.xor $push4=, $pop3, $48
+; NO-SIMD128-NEXT:    i32.store8 0($pop6), $pop4
+; NO-SIMD128-NEXT:    i32.const $push11=, 14
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.xor $push8=, $31, $47
+; NO-SIMD128-NEXT:    i32.const $push101=, -1
+; NO-SIMD128-NEXT:    i32.xor $push7=, $15, $pop101
+; NO-SIMD128-NEXT:    i32.and $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.xor $push10=, $pop9, $47
+; NO-SIMD128-NEXT:    i32.store8 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push17=, 13
+; NO-SIMD128-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-NEXT:    i32.xor $push14=, $30, $46
+; NO-SIMD128-NEXT:    i32.const $push100=, -1
+; NO-SIMD128-NEXT:    i32.xor $push13=, $14, $pop100
+; NO-SIMD128-NEXT:    i32.and $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT:    i32.xor $push16=, $pop15, $46
+; NO-SIMD128-NEXT:    i32.store8 0($pop18), $pop16
+; NO-SIMD128-NEXT:    i32.const $push23=, 12
+; NO-SIMD128-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-NEXT:    i32.xor $push20=, $29, $45
+; NO-SIMD128-NEXT:    i32.const $push99=, -1
+; NO-SIMD128-NEXT:    i32.xor $push19=, $13, $pop99
+; NO-SIMD128-NEXT:    i32.and $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT:    i32.xor $push22=, $pop21, $45
+; NO-SIMD128-NEXT:    i32.store8 0($pop24), $pop22
+; NO-SIMD128-NEXT:    i32.const $push29=, 11
+; NO-SIMD128-NEXT:    i32.add $push30=, $0, $pop29
+; NO-SIMD128-NEXT:    i32.xor $push26=, $28, $44
+; NO-SIMD128-NEXT:    i32.const $push98=, -1
+; NO-SIMD128-NEXT:    i32.xor $push25=, $12, $pop98
+; NO-SIMD128-NEXT:    i32.and $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT:    i32.xor $push28=, $pop27, $44
+; NO-SIMD128-NEXT:    i32.store8 0($pop30), $pop28
+; NO-SIMD128-NEXT:    i32.const $push35=, 10
+; NO-SIMD128-NEXT:    i32.add $push36=, $0, $pop35
+; NO-SIMD128-NEXT:    i32.xor $push32=, $27, $43
+; NO-SIMD128-NEXT:    i32.const $push97=, -1
+; NO-SIMD128-NEXT:    i32.xor $push31=, $11, $pop97
+; NO-SIMD128-NEXT:    i32.and $push33=, $pop32, $pop31
+; NO-SIMD128-NEXT:    i32.xor $push34=, $pop33, $43
+; NO-SIMD128-NEXT:    i32.store8 0($pop36), $pop34
+; NO-SIMD128-NEXT:    i32.const $push41=, 9
+; NO-SIMD128-NEXT:    i32.add $push42=, $0, $pop41
+; NO-SIMD128-NEXT:    i32.xor $push38=, $26, $42
+; NO-SIMD128-NEXT:    i32.const $push96=, -1
+; NO-SIMD128-NEXT:    i32.xor $push37=, $10, $pop96
+; NO-SIMD128-NEXT:    i32.and $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT:    i32.xor $push40=, $pop39, $42
+; NO-SIMD128-NEXT:    i32.store8 0($pop42), $pop40
+; NO-SIMD128-NEXT:    i32.xor $push44=, $25, $41
+; NO-SIMD128-NEXT:    i32.const $push95=, -1
+; NO-SIMD128-NEXT:    i32.xor $push43=, $9, $pop95
+; NO-SIMD128-NEXT:    i32.and $push45=, $pop44, $pop43
+; NO-SIMD128-NEXT:    i32.xor $push46=, $pop45, $41
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop46
+; NO-SIMD128-NEXT:    i32.const $push51=, 7
+; NO-SIMD128-NEXT:    i32.add $push52=, $0, $pop51
+; NO-SIMD128-NEXT:    i32.xor $push48=, $24, $40
+; NO-SIMD128-NEXT:    i32.const $push94=, -1
+; NO-SIMD128-NEXT:    i32.xor $push47=, $8, $pop94
+; NO-SIMD128-NEXT:    i32.and $push49=, $pop48, $pop47
+; NO-SIMD128-NEXT:    i32.xor $push50=, $pop49, $40
+; NO-SIMD128-NEXT:    i32.store8 0($pop52), $pop50
+; NO-SIMD128-NEXT:    i32.const $push57=, 6
+; NO-SIMD128-NEXT:    i32.add $push58=, $0, $pop57
+; NO-SIMD128-NEXT:    i32.xor $push54=, $23, $39
+; NO-SIMD128-NEXT:    i32.const $push93=, -1
+; NO-SIMD128-NEXT:    i32.xor $push53=, $7, $pop93
+; NO-SIMD128-NEXT:    i32.and $push55=, $pop54, $pop53
+; NO-SIMD128-NEXT:    i32.xor $push56=, $pop55, $39
+; NO-SIMD128-NEXT:    i32.store8 0($pop58), $pop56
+; NO-SIMD128-NEXT:    i32.const $push63=, 5
+; NO-SIMD128-NEXT:    i32.add $push64=, $0, $pop63
+; NO-SIMD128-NEXT:    i32.xor $push60=, $22, $38
+; NO-SIMD128-NEXT:    i32.const $push92=, -1
+; NO-SIMD128-NEXT:    i32.xor $push59=, $6, $pop92
+; NO-SIMD128-NEXT:    i32.and $push61=, $pop60, $pop59
+; NO-SIMD128-NEXT:    i32.xor $push62=, $pop61, $38
+; NO-SIMD128-NEXT:    i32.store8 0($pop64), $pop62
+; NO-SIMD128-NEXT:    i32.xor $push66=, $21, $37
+; NO-SIMD128-NEXT:    i32.const $push91=, -1
+; NO-SIMD128-NEXT:    i32.xor $push65=, $5, $pop91
+; NO-SIMD128-NEXT:    i32.and $push67=, $pop66, $pop65
+; NO-SIMD128-NEXT:    i32.xor $push68=, $pop67, $37
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop68
+; NO-SIMD128-NEXT:    i32.const $push73=, 3
+; NO-SIMD128-NEXT:    i32.add $push74=, $0, $pop73
+; NO-SIMD128-NEXT:    i32.xor $push70=, $20, $36
+; NO-SIMD128-NEXT:    i32.const $push90=, -1
+; NO-SIMD128-NEXT:    i32.xor $push69=, $4, $pop90
+; NO-SIMD128-NEXT:    i32.and $push71=, $pop70, $pop69
+; NO-SIMD128-NEXT:    i32.xor $push72=, $pop71, $36
+; NO-SIMD128-NEXT:    i32.store8 0($pop74), $pop72
+; NO-SIMD128-NEXT:    i32.xor $push76=, $19, $35
+; NO-SIMD128-NEXT:    i32.const $push89=, -1
+; NO-SIMD128-NEXT:    i32.xor $push75=, $3, $pop89
+; NO-SIMD128-NEXT:    i32.and $push77=, $pop76, $pop75
+; NO-SIMD128-NEXT:    i32.xor $push78=, $pop77, $35
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop78
+; NO-SIMD128-NEXT:    i32.xor $push80=, $18, $34
+; NO-SIMD128-NEXT:    i32.const $push88=, -1
+; NO-SIMD128-NEXT:    i32.xor $push79=, $2, $pop88
+; NO-SIMD128-NEXT:    i32.and $push81=, $pop80, $pop79
+; NO-SIMD128-NEXT:    i32.xor $push82=, $pop81, $34
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop82
+; NO-SIMD128-NEXT:    i32.xor $push84=, $17, $33
+; NO-SIMD128-NEXT:    i32.const $push87=, -1
+; NO-SIMD128-NEXT:    i32.xor $push83=, $1, $pop87
+; NO-SIMD128-NEXT:    i32.and $push85=, $pop84, $pop83
+; NO-SIMD128-NEXT:    i32.xor $push86=, $pop85, $33
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop86
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: bitselect_xor_reversed_v16i8:
+; NO-SIMD128-FAST:         .functype bitselect_xor_reversed_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $17, $33
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push4=, $pop3, $33
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $18, $34
+; NO-SIMD128-FAST-NEXT:    i32.const $push101=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $2, $pop101
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $pop5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $34
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $19, $35
+; NO-SIMD128-FAST-NEXT:    i32.const $push100=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $3, $pop100
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.xor $push12=, $pop11, $35
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 3
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $20, $36
+; NO-SIMD128-FAST-NEXT:    i32.const $push99=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push13=, $4, $pop99
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.xor $push16=, $pop15, $36
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop18), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.xor $push20=, $21, $37
+; NO-SIMD128-FAST-NEXT:    i32.const $push98=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push19=, $5, $pop98
+; NO-SIMD128-FAST-NEXT:    i32.and $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.xor $push22=, $pop21, $37
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 5
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.xor $push24=, $22, $38
+; NO-SIMD128-FAST-NEXT:    i32.const $push97=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push23=, $6, $pop97
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $pop24, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $pop25, $38
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop28), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.xor $push30=, $23, $39
+; NO-SIMD128-FAST-NEXT:    i32.const $push96=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push29=, $7, $pop96
+; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $pop30, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.xor $push32=, $pop31, $39
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop34), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 7
+; NO-SIMD128-FAST-NEXT:    i32.add $push40=, $0, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.xor $push36=, $24, $40
+; NO-SIMD128-FAST-NEXT:    i32.const $push95=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push35=, $8, $pop95
+; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $pop36, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.xor $push38=, $pop37, $40
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT:    i32.xor $push42=, $25, $41
+; NO-SIMD128-FAST-NEXT:    i32.const $push94=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push41=, $9, $pop94
+; NO-SIMD128-FAST-NEXT:    i32.and $push43=, $pop42, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.xor $push44=, $pop43, $41
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop44
+; NO-SIMD128-FAST-NEXT:    i32.const $push49=, 9
+; NO-SIMD128-FAST-NEXT:    i32.add $push50=, $0, $pop49
+; NO-SIMD128-FAST-NEXT:    i32.xor $push46=, $26, $42
+; NO-SIMD128-FAST-NEXT:    i32.const $push93=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push45=, $10, $pop93
+; NO-SIMD128-FAST-NEXT:    i32.and $push47=, $pop46, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.xor $push48=, $pop47, $42
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop50), $pop48
+; NO-SIMD128-FAST-NEXT:    i32.const $push55=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push56=, $0, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.xor $push52=, $27, $43
+; NO-SIMD128-FAST-NEXT:    i32.const $push92=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push51=, $11, $pop92
+; NO-SIMD128-FAST-NEXT:    i32.and $push53=, $pop52, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.xor $push54=, $pop53, $43
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop56), $pop54
+; NO-SIMD128-FAST-NEXT:    i32.const $push61=, 11
+; NO-SIMD128-FAST-NEXT:    i32.add $push62=, $0, $pop61
+; NO-SIMD128-FAST-NEXT:    i32.xor $push58=, $28, $44
+; NO-SIMD128-FAST-NEXT:    i32.const $push91=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push57=, $12, $pop91
+; NO-SIMD128-FAST-NEXT:    i32.and $push59=, $pop58, $pop57
+; NO-SIMD128-FAST-NEXT:    i32.xor $push60=, $pop59, $44
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop62), $pop60
+; NO-SIMD128-FAST-NEXT:    i32.const $push67=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push68=, $0, $pop67
+; NO-SIMD128-FAST-NEXT:    i32.xor $push64=, $29, $45
+; NO-SIMD128-FAST-NEXT:    i32.const $push90=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push63=, $13, $pop90
+; NO-SIMD128-FAST-NEXT:    i32.and $push65=, $pop64, $pop63
+; NO-SIMD128-FAST-NEXT:    i32.xor $push66=, $pop65, $45
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop68), $pop66
+; NO-SIMD128-FAST-NEXT:    i32.const $push73=, 13
+; NO-SIMD128-FAST-NEXT:    i32.add $push74=, $0, $pop73
+; NO-SIMD128-FAST-NEXT:    i32.xor $push70=, $30, $46
+; NO-SIMD128-FAST-NEXT:    i32.const $push89=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push69=, $14, $pop89
+; NO-SIMD128-FAST-NEXT:    i32.and $push71=, $pop70, $pop69
+; NO-SIMD128-FAST-NEXT:    i32.xor $push72=, $pop71, $46
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop74), $pop72
+; NO-SIMD128-FAST-NEXT:    i32.const $push79=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push80=, $0, $pop79
+; NO-SIMD128-FAST-NEXT:    i32.xor $push76=, $31, $47
+; NO-SIMD128-FAST-NEXT:    i32.const $push88=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push75=, $15, $pop88
+; NO-SIMD128-FAST-NEXT:    i32.and $push77=, $pop76, $pop75
+; NO-SIMD128-FAST-NEXT:    i32.xor $push78=, $pop77, $47
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop80), $pop78
+; NO-SIMD128-FAST-NEXT:    i32.const $push85=, 15
+; NO-SIMD128-FAST-NEXT:    i32.add $push86=, $0, $pop85
+; NO-SIMD128-FAST-NEXT:    i32.xor $push82=, $32, $48
+; NO-SIMD128-FAST-NEXT:    i32.const $push87=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push81=, $16, $pop87
+; NO-SIMD128-FAST-NEXT:    i32.and $push83=, $pop82, $pop81
+; NO-SIMD128-FAST-NEXT:    i32.xor $push84=, $pop83, $48
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($pop86), $pop84
+; NO-SIMD128-FAST-NEXT:    return
  %xor1 = xor <16 x i8> %v1, %v2
  %notc = xor <16 x i8> %c, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
                             i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -379,132 +6202,1459 @@ define <16 x i8> @bitselect_xor_reversed_v16i8(<16 x i8> %c, <16 x i8> %v1, <16
 ; ==============================================================================
 ; 8 x i16
 ; ==============================================================================
-; CHECK-LABEL: add_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype add_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.add $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @add_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SIMD128-LABEL: add_v8i16:
+; SIMD128:         .functype add_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.add $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: add_v8i16:
+; SIMD128-FAST:         .functype add_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.add $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: add_v8i16:
+; NO-SIMD128:         .functype add_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.add $push0=, $5, $13
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.add $push1=, $3, $11
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.add $push2=, $2, $10
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop2
+; NO-SIMD128-NEXT:    i32.add $push3=, $1, $9
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 14
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.add $push4=, $8, $16
+; NO-SIMD128-NEXT:    i32.store16 0($pop6), $pop4
+; NO-SIMD128-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.add $push7=, $7, $15
+; NO-SIMD128-NEXT:    i32.store16 0($pop9), $pop7
+; NO-SIMD128-NEXT:    i32.const $push11=, 10
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.add $push10=, $6, $14
+; NO-SIMD128-NEXT:    i32.store16 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push14=, 6
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.add $push13=, $4, $12
+; NO-SIMD128-NEXT:    i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: add_v8i16:
+; NO-SIMD128-FAST:         .functype add_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.add $push0=, $1, $9
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.add $push1=, $2, $10
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.add $push2=, $3, $11
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $4, $12
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.add $push6=, $5, $13
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $6, $14
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop8), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push10=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.add $push12=, $7, $15
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop11), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.add $push15=, $8, $16
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT:    return
   %a = add <8 x i16> %x, %y
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: sub_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype sub_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.sub $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @sub_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SIMD128-LABEL: sub_v8i16:
+; SIMD128:         .functype sub_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.sub $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: sub_v8i16:
+; SIMD128-FAST:         .functype sub_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.sub $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: sub_v8i16:
+; NO-SIMD128:         .functype sub_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.sub $push0=, $5, $13
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.sub $push1=, $3, $11
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.sub $push2=, $2, $10
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop2
+; NO-SIMD128-NEXT:    i32.sub $push3=, $1, $9
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 14
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.sub $push4=, $8, $16
+; NO-SIMD128-NEXT:    i32.store16 0($pop6), $pop4
+; NO-SIMD128-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.sub $push7=, $7, $15
+; NO-SIMD128-NEXT:    i32.store16 0($pop9), $pop7
+; NO-SIMD128-NEXT:    i32.const $push11=, 10
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.sub $push10=, $6, $14
+; NO-SIMD128-NEXT:    i32.store16 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push14=, 6
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.sub $push13=, $4, $12
+; NO-SIMD128-NEXT:    i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: sub_v8i16:
+; NO-SIMD128-FAST:         .functype sub_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.sub $push0=, $1, $9
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push1=, $2, $10
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.sub $push2=, $3, $11
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.sub $push5=, $4, $12
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.sub $push6=, $5, $13
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.sub $push9=, $6, $14
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop8), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push10=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.sub $push12=, $7, $15
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop11), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.sub $push15=, $8, $16
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT:    return
   %a = sub <8 x i16> %x, %y
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: mul_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype mul_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.mul $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @mul_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SIMD128-LABEL: mul_v8i16:
+; SIMD128:         .functype mul_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.mul $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: mul_v8i16:
+; SIMD128-FAST:         .functype mul_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.mul $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: mul_v8i16:
+; NO-SIMD128:         .functype mul_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.mul $push0=, $5, $13
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.mul $push1=, $3, $11
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.mul $push2=, $2, $10
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop2
+; NO-SIMD128-NEXT:    i32.mul $push3=, $1, $9
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 14
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.mul $push4=, $8, $16
+; NO-SIMD128-NEXT:    i32.store16 0($pop6), $pop4
+; NO-SIMD128-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.mul $push7=, $7, $15
+; NO-SIMD128-NEXT:    i32.store16 0($pop9), $pop7
+; NO-SIMD128-NEXT:    i32.const $push11=, 10
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.mul $push10=, $6, $14
+; NO-SIMD128-NEXT:    i32.store16 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push14=, 6
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.mul $push13=, $4, $12
+; NO-SIMD128-NEXT:    i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: mul_v8i16:
+; NO-SIMD128-FAST:         .functype mul_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.mul $push0=, $1, $9
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.mul $push1=, $2, $10
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.mul $push2=, $3, $11
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.mul $push5=, $4, $12
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.mul $push6=, $5, $13
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.mul $push9=, $6, $14
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop8), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push10=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.mul $push12=, $7, $15
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop11), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.mul $push15=, $8, $16
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT:    return
   %a = mul <8 x i16> %x, %y
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: min_s_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype min_s_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.min_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @min_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SIMD128-LABEL: min_s_v8i16:
+; SIMD128:         .functype min_s_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.min_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: min_s_v8i16:
+; SIMD128-FAST:         .functype min_s_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.min_s $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: min_s_v8i16:
+; NO-SIMD128:         .functype min_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push4=, 14
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.extend16_s $push1=, $8
+; NO-SIMD128-NEXT:    i32.extend16_s $push0=, $16
+; NO-SIMD128-NEXT:    i32.lt_s $push2=, $pop1, $pop0
+; NO-SIMD128-NEXT:    i32.select $push3=, $8, $16, $pop2
+; NO-SIMD128-NEXT:    i32.store16 0($pop5), $pop3
+; NO-SIMD128-NEXT:    i32.const $push10=, 12
+; NO-SIMD128-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-NEXT:    i32.extend16_s $push7=, $7
+; NO-SIMD128-NEXT:    i32.extend16_s $push6=, $15
+; NO-SIMD128-NEXT:    i32.lt_s $push8=, $pop7, $pop6
+; NO-SIMD128-NEXT:    i32.select $push9=, $7, $15, $pop8
+; NO-SIMD128-NEXT:    i32.store16 0($pop11), $pop9
+; NO-SIMD128-NEXT:    i32.const $push16=, 10
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.extend16_s $push13=, $6
+; NO-SIMD128-NEXT:    i32.extend16_s $push12=, $14
+; NO-SIMD128-NEXT:    i32.lt_s $push14=, $pop13, $pop12
+; NO-SIMD128-NEXT:    i32.select $push15=, $6, $14, $pop14
+; NO-SIMD128-NEXT:    i32.store16 0($pop17), $pop15
+; NO-SIMD128-NEXT:    i32.extend16_s $push19=, $5
+; NO-SIMD128-NEXT:    i32.extend16_s $push18=, $13
+; NO-SIMD128-NEXT:    i32.lt_s $push20=, $pop19, $pop18
+; NO-SIMD128-NEXT:    i32.select $push21=, $5, $13, $pop20
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop21
+; NO-SIMD128-NEXT:    i32.const $push26=, 6
+; NO-SIMD128-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT:    i32.extend16_s $push23=, $4
+; NO-SIMD128-NEXT:    i32.extend16_s $push22=, $12
+; NO-SIMD128-NEXT:    i32.lt_s $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT:    i32.select $push25=, $4, $12, $pop24
+; NO-SIMD128-NEXT:    i32.store16 0($pop27), $pop25
+; NO-SIMD128-NEXT:    i32.extend16_s $push29=, $3
+; NO-SIMD128-NEXT:    i32.extend16_s $push28=, $11
+; NO-SIMD128-NEXT:    i32.lt_s $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT:    i32.select $push31=, $3, $11, $pop30
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop31
+; NO-SIMD128-NEXT:    i32.extend16_s $push33=, $2
+; NO-SIMD128-NEXT:    i32.extend16_s $push32=, $10
+; NO-SIMD128-NEXT:    i32.lt_s $push34=, $pop33, $pop32
+; NO-SIMD128-NEXT:    i32.select $push35=, $2, $10, $pop34
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop35
+; NO-SIMD128-NEXT:    i32.extend16_s $push37=, $1
+; NO-SIMD128-NEXT:    i32.extend16_s $push36=, $9
+; NO-SIMD128-NEXT:    i32.lt_s $push38=, $pop37, $pop36
+; NO-SIMD128-NEXT:    i32.select $push39=, $1, $9, $pop38
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop39
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: min_s_v8i16:
+; NO-SIMD128-FAST:         .functype min_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push1=, $1
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push0=, $9
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push2=, $pop1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.select $push3=, $1, $9, $pop2
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push5=, $2
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push4=, $10
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.select $push7=, $2, $10, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop7
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push9=, $3
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push8=, $11
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push10=, $pop9, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.select $push11=, $3, $11, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop11
+; NO-SIMD128-FAST-NEXT:    i32.const $push16=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push13=, $4
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push12=, $12
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push14=, $pop13, $pop12
+; NO-SIMD128-FAST-NEXT:    i32.select $push15=, $4, $12, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop17), $pop15
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push19=, $5
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push18=, $13
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push20=, $pop19, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.select $push21=, $5, $13, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push23=, $6
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push22=, $14
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT:    i32.select $push25=, $6, $14, $pop24
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop27), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push32=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push33=, $0, $pop32
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push29=, $7
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push28=, $15
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push30=, $pop29, $pop28
+; NO-SIMD128-FAST-NEXT:    i32.select $push31=, $7, $15, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop33), $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push38=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push39=, $0, $pop38
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push35=, $8
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push34=, $16
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push36=, $pop35, $pop34
+; NO-SIMD128-FAST-NEXT:    i32.select $push37=, $8, $16, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop39), $pop37
+; NO-SIMD128-FAST-NEXT:    return
   %c = icmp slt <8 x i16> %x, %y
   %a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: min_u_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype min_u_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.min_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @min_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SIMD128-LABEL: min_u_v8i16:
+; SIMD128:         .functype min_u_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.min_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: min_u_v8i16:
+; SIMD128-FAST:         .functype min_u_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.min_u $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: min_u_v8i16:
+; NO-SIMD128:         .functype min_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push5=, 14
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-NEXT:    i32.and $push2=, $8, $pop0
+; NO-SIMD128-NEXT:    i32.const $push55=, 65535
+; NO-SIMD128-NEXT:    i32.and $push1=, $16, $pop55
+; NO-SIMD128-NEXT:    i32.lt_u $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.select $push4=, $8, $16, $pop3
+; NO-SIMD128-NEXT:    i32.store16 0($pop6), $pop4
+; NO-SIMD128-NEXT:    i32.const $push11=, 12
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.const $push54=, 65535
+; NO-SIMD128-NEXT:    i32.and $push8=, $7, $pop54
+; NO-SIMD128-NEXT:    i32.const $push53=, 65535
+; NO-SIMD128-NEXT:    i32.and $push7=, $15, $pop53
+; NO-SIMD128-NEXT:    i32.lt_u $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.select $push10=, $7, $15, $pop9
+; NO-SIMD128-NEXT:    i32.store16 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push17=, 10
+; NO-SIMD128-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-NEXT:    i32.const $push52=, 65535
+; NO-SIMD128-NEXT:    i32.and $push14=, $6, $pop52
+; NO-SIMD128-NEXT:    i32.const $push51=, 65535
+; NO-SIMD128-NEXT:    i32.and $push13=, $14, $pop51
+; NO-SIMD128-NEXT:    i32.lt_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT:    i32.select $push16=, $6, $14, $pop15
+; NO-SIMD128-NEXT:    i32.store16 0($pop18), $pop16
+; NO-SIMD128-NEXT:    i32.const $push50=, 65535
+; NO-SIMD128-NEXT:    i32.and $push20=, $5, $pop50
+; NO-SIMD128-NEXT:    i32.const $push49=, 65535
+; NO-SIMD128-NEXT:    i32.and $push19=, $13, $pop49
+; NO-SIMD128-NEXT:    i32.lt_u $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT:    i32.select $push22=, $5, $13, $pop21
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop22
+; NO-SIMD128-NEXT:    i32.const $push27=, 6
+; NO-SIMD128-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-NEXT:    i32.const $push48=, 65535
+; NO-SIMD128-NEXT:    i32.and $push24=, $4, $pop48
+; NO-SIMD128-NEXT:    i32.const $push47=, 65535
+; NO-SIMD128-NEXT:    i32.and $push23=, $12, $pop47
+; NO-SIMD128-NEXT:    i32.lt_u $push25=, $pop24, $pop23
+; NO-SIMD128-NEXT:    i32.select $push26=, $4, $12, $pop25
+; NO-SIMD128-NEXT:    i32.store16 0($pop28), $pop26
+; NO-SIMD128-NEXT:    i32.const $push46=, 65535
+; NO-SIMD128-NEXT:    i32.and $push30=, $3, $pop46
+; NO-SIMD128-NEXT:    i32.const $push45=, 65535
+; NO-SIMD128-NEXT:    i32.and $push29=, $11, $pop45
+; NO-SIMD128-NEXT:    i32.lt_u $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT:    i32.select $push32=, $3, $11, $pop31
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop32
+; NO-SIMD128-NEXT:    i32.const $push44=, 65535
+; NO-SIMD128-NEXT:    i32.and $push34=, $2, $pop44
+; NO-SIMD128-NEXT:    i32.const $push43=, 65535
+; NO-SIMD128-NEXT:    i32.and $push33=, $10, $pop43
+; NO-SIMD128-NEXT:    i32.lt_u $push35=, $pop34, $pop33
+; NO-SIMD128-NEXT:    i32.select $push36=, $2, $10, $pop35
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop36
+; NO-SIMD128-NEXT:    i32.const $push42=, 65535
+; NO-SIMD128-NEXT:    i32.and $push38=, $1, $pop42
+; NO-SIMD128-NEXT:    i32.const $push41=, 65535
+; NO-SIMD128-NEXT:    i32.and $push37=, $9, $pop41
+; NO-SIMD128-NEXT:    i32.lt_u $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT:    i32.select $push40=, $1, $9, $pop39
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop40
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: min_u_v8i16:
+; NO-SIMD128-FAST:         .functype min_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.const $push55=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $9, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.select $push4=, $1, $9, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push54=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push6=, $2, $pop54
+; NO-SIMD128-FAST-NEXT:    i32.const $push53=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $10, $pop53
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push7=, $pop6, $pop5
+; NO-SIMD128-FAST-NEXT:    i32.select $push8=, $2, $10, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push52=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $3, $pop52
+; NO-SIMD128-FAST-NEXT:    i32.const $push51=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $11, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.select $push12=, $3, $11, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push50=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push14=, $4, $pop50
+; NO-SIMD128-FAST-NEXT:    i32.const $push49=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $12, $pop49
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.select $push16=, $4, $12, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop18), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push48=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push20=, $5, $pop48
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $13, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.select $push22=, $5, $13, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.const $push46=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push24=, $6, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.const $push45=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $14, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push25=, $pop24, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.select $push26=, $6, $14, $pop25
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop28), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.const $push44=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push30=, $7, $pop44
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $15, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push31=, $pop30, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.select $push32=, $7, $15, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop34), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push40=, $0, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.const $push42=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push36=, $8, $pop42
+; NO-SIMD128-FAST-NEXT:    i32.const $push41=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push35=, $16, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push37=, $pop36, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.select $push38=, $8, $16, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT:    return
   %c = icmp ult <8 x i16> %x, %y
   %a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: max_s_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype max_s_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.max_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @max_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SIMD128-LABEL: max_s_v8i16:
+; SIMD128:         .functype max_s_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.max_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: max_s_v8i16:
+; SIMD128-FAST:         .functype max_s_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.max_s $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: max_s_v8i16:
+; NO-SIMD128:         .functype max_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push4=, 14
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.extend16_s $push1=, $8
+; NO-SIMD128-NEXT:    i32.extend16_s $push0=, $16
+; NO-SIMD128-NEXT:    i32.gt_s $push2=, $pop1, $pop0
+; NO-SIMD128-NEXT:    i32.select $push3=, $8, $16, $pop2
+; NO-SIMD128-NEXT:    i32.store16 0($pop5), $pop3
+; NO-SIMD128-NEXT:    i32.const $push10=, 12
+; NO-SIMD128-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-NEXT:    i32.extend16_s $push7=, $7
+; NO-SIMD128-NEXT:    i32.extend16_s $push6=, $15
+; NO-SIMD128-NEXT:    i32.gt_s $push8=, $pop7, $pop6
+; NO-SIMD128-NEXT:    i32.select $push9=, $7, $15, $pop8
+; NO-SIMD128-NEXT:    i32.store16 0($pop11), $pop9
+; NO-SIMD128-NEXT:    i32.const $push16=, 10
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.extend16_s $push13=, $6
+; NO-SIMD128-NEXT:    i32.extend16_s $push12=, $14
+; NO-SIMD128-NEXT:    i32.gt_s $push14=, $pop13, $pop12
+; NO-SIMD128-NEXT:    i32.select $push15=, $6, $14, $pop14
+; NO-SIMD128-NEXT:    i32.store16 0($pop17), $pop15
+; NO-SIMD128-NEXT:    i32.extend16_s $push19=, $5
+; NO-SIMD128-NEXT:    i32.extend16_s $push18=, $13
+; NO-SIMD128-NEXT:    i32.gt_s $push20=, $pop19, $pop18
+; NO-SIMD128-NEXT:    i32.select $push21=, $5, $13, $pop20
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop21
+; NO-SIMD128-NEXT:    i32.const $push26=, 6
+; NO-SIMD128-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT:    i32.extend16_s $push23=, $4
+; NO-SIMD128-NEXT:    i32.extend16_s $push22=, $12
+; NO-SIMD128-NEXT:    i32.gt_s $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT:    i32.select $push25=, $4, $12, $pop24
+; NO-SIMD128-NEXT:    i32.store16 0($pop27), $pop25
+; NO-SIMD128-NEXT:    i32.extend16_s $push29=, $3
+; NO-SIMD128-NEXT:    i32.extend16_s $push28=, $11
+; NO-SIMD128-NEXT:    i32.gt_s $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT:    i32.select $push31=, $3, $11, $pop30
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop31
+; NO-SIMD128-NEXT:    i32.extend16_s $push33=, $2
+; NO-SIMD128-NEXT:    i32.extend16_s $push32=, $10
+; NO-SIMD128-NEXT:    i32.gt_s $push34=, $pop33, $pop32
+; NO-SIMD128-NEXT:    i32.select $push35=, $2, $10, $pop34
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop35
+; NO-SIMD128-NEXT:    i32.extend16_s $push37=, $1
+; NO-SIMD128-NEXT:    i32.extend16_s $push36=, $9
+; NO-SIMD128-NEXT:    i32.gt_s $push38=, $pop37, $pop36
+; NO-SIMD128-NEXT:    i32.select $push39=, $1, $9, $pop38
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop39
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: max_s_v8i16:
+; NO-SIMD128-FAST:         .functype max_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push1=, $1
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push0=, $9
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push2=, $pop1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.select $push3=, $1, $9, $pop2
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push5=, $2
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push4=, $10
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.select $push7=, $2, $10, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop7
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push9=, $3
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push8=, $11
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push10=, $pop9, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.select $push11=, $3, $11, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop11
+; NO-SIMD128-FAST-NEXT:    i32.const $push16=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push13=, $4
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push12=, $12
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push14=, $pop13, $pop12
+; NO-SIMD128-FAST-NEXT:    i32.select $push15=, $4, $12, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop17), $pop15
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push19=, $5
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push18=, $13
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push20=, $pop19, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.select $push21=, $5, $13, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push23=, $6
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push22=, $14
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT:    i32.select $push25=, $6, $14, $pop24
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop27), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push32=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push33=, $0, $pop32
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push29=, $7
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push28=, $15
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push30=, $pop29, $pop28
+; NO-SIMD128-FAST-NEXT:    i32.select $push31=, $7, $15, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop33), $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push38=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push39=, $0, $pop38
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push35=, $8
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push34=, $16
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push36=, $pop35, $pop34
+; NO-SIMD128-FAST-NEXT:    i32.select $push37=, $8, $16, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop39), $pop37
+; NO-SIMD128-FAST-NEXT:    return
   %c = icmp sgt <8 x i16> %x, %y
   %a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: max_u_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype max_u_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.max_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @max_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SIMD128-LABEL: max_u_v8i16:
+; SIMD128:         .functype max_u_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.max_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: max_u_v8i16:
+; SIMD128-FAST:         .functype max_u_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.max_u $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: max_u_v8i16:
+; NO-SIMD128:         .functype max_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push5=, 14
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-NEXT:    i32.and $push2=, $8, $pop0
+; NO-SIMD128-NEXT:    i32.const $push55=, 65535
+; NO-SIMD128-NEXT:    i32.and $push1=, $16, $pop55
+; NO-SIMD128-NEXT:    i32.gt_u $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.select $push4=, $8, $16, $pop3
+; NO-SIMD128-NEXT:    i32.store16 0($pop6), $pop4
+; NO-SIMD128-NEXT:    i32.const $push11=, 12
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.const $push54=, 65535
+; NO-SIMD128-NEXT:    i32.and $push8=, $7, $pop54
+; NO-SIMD128-NEXT:    i32.const $push53=, 65535
+; NO-SIMD128-NEXT:    i32.and $push7=, $15, $pop53
+; NO-SIMD128-NEXT:    i32.gt_u $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.select $push10=, $7, $15, $pop9
+; NO-SIMD128-NEXT:    i32.store16 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push17=, 10
+; NO-SIMD128-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-NEXT:    i32.const $push52=, 65535
+; NO-SIMD128-NEXT:    i32.and $push14=, $6, $pop52
+; NO-SIMD128-NEXT:    i32.const $push51=, 65535
+; NO-SIMD128-NEXT:    i32.and $push13=, $14, $pop51
+; NO-SIMD128-NEXT:    i32.gt_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT:    i32.select $push16=, $6, $14, $pop15
+; NO-SIMD128-NEXT:    i32.store16 0($pop18), $pop16
+; NO-SIMD128-NEXT:    i32.const $push50=, 65535
+; NO-SIMD128-NEXT:    i32.and $push20=, $5, $pop50
+; NO-SIMD128-NEXT:    i32.const $push49=, 65535
+; NO-SIMD128-NEXT:    i32.and $push19=, $13, $pop49
+; NO-SIMD128-NEXT:    i32.gt_u $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT:    i32.select $push22=, $5, $13, $pop21
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop22
+; NO-SIMD128-NEXT:    i32.const $push27=, 6
+; NO-SIMD128-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-NEXT:    i32.const $push48=, 65535
+; NO-SIMD128-NEXT:    i32.and $push24=, $4, $pop48
+; NO-SIMD128-NEXT:    i32.const $push47=, 65535
+; NO-SIMD128-NEXT:    i32.and $push23=, $12, $pop47
+; NO-SIMD128-NEXT:    i32.gt_u $push25=, $pop24, $pop23
+; NO-SIMD128-NEXT:    i32.select $push26=, $4, $12, $pop25
+; NO-SIMD128-NEXT:    i32.store16 0($pop28), $pop26
+; NO-SIMD128-NEXT:    i32.const $push46=, 65535
+; NO-SIMD128-NEXT:    i32.and $push30=, $3, $pop46
+; NO-SIMD128-NEXT:    i32.const $push45=, 65535
+; NO-SIMD128-NEXT:    i32.and $push29=, $11, $pop45
+; NO-SIMD128-NEXT:    i32.gt_u $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT:    i32.select $push32=, $3, $11, $pop31
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop32
+; NO-SIMD128-NEXT:    i32.const $push44=, 65535
+; NO-SIMD128-NEXT:    i32.and $push34=, $2, $pop44
+; NO-SIMD128-NEXT:    i32.const $push43=, 65535
+; NO-SIMD128-NEXT:    i32.and $push33=, $10, $pop43
+; NO-SIMD128-NEXT:    i32.gt_u $push35=, $pop34, $pop33
+; NO-SIMD128-NEXT:    i32.select $push36=, $2, $10, $pop35
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop36
+; NO-SIMD128-NEXT:    i32.const $push42=, 65535
+; NO-SIMD128-NEXT:    i32.and $push38=, $1, $pop42
+; NO-SIMD128-NEXT:    i32.const $push41=, 65535
+; NO-SIMD128-NEXT:    i32.and $push37=, $9, $pop41
+; NO-SIMD128-NEXT:    i32.gt_u $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT:    i32.select $push40=, $1, $9, $pop39
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop40
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: max_u_v8i16:
+; NO-SIMD128-FAST:         .functype max_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.const $push55=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $9, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.select $push4=, $1, $9, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push54=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push6=, $2, $pop54
+; NO-SIMD128-FAST-NEXT:    i32.const $push53=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $10, $pop53
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push7=, $pop6, $pop5
+; NO-SIMD128-FAST-NEXT:    i32.select $push8=, $2, $10, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push52=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $3, $pop52
+; NO-SIMD128-FAST-NEXT:    i32.const $push51=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $11, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.select $push12=, $3, $11, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push50=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push14=, $4, $pop50
+; NO-SIMD128-FAST-NEXT:    i32.const $push49=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $12, $pop49
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.select $push16=, $4, $12, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop18), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push48=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push20=, $5, $pop48
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $13, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.select $push22=, $5, $13, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.const $push46=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push24=, $6, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.const $push45=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $14, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push25=, $pop24, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.select $push26=, $6, $14, $pop25
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop28), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.const $push44=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push30=, $7, $pop44
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $15, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push31=, $pop30, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.select $push32=, $7, $15, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop34), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push40=, $0, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.const $push42=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push36=, $8, $pop42
+; NO-SIMD128-FAST-NEXT:    i32.const $push41=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push35=, $16, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push37=, $pop36, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.select $push38=, $8, $16, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT:    return
   %c = icmp ugt <8 x i16> %x, %y
   %a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: avgr_u_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype avgr_u_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.avgr_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @avgr_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SIMD128-LABEL: avgr_u_v8i16:
+; SIMD128:         .functype avgr_u_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.avgr_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: avgr_u_v8i16:
+; SIMD128-FAST:         .functype avgr_u_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.avgr_u $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: avgr_u_v8i16:
+; NO-SIMD128:         .functype avgr_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 14
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.add $push2=, $8, $16
+; NO-SIMD128-NEXT:    i32.const $push3=, 1
+; NO-SIMD128-NEXT:    i32.add $push4=, $pop2, $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 65534
+; NO-SIMD128-NEXT:    i32.and $push6=, $pop4, $pop5
+; NO-SIMD128-NEXT:    i32.const $push63=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push7=, $pop6, $pop63
+; NO-SIMD128-NEXT:    i32.store16 0($pop1), $pop7
+; NO-SIMD128-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.add $push10=, $7, $15
+; NO-SIMD128-NEXT:    i32.const $push62=, 1
+; NO-SIMD128-NEXT:    i32.add $push11=, $pop10, $pop62
+; NO-SIMD128-NEXT:    i32.const $push61=, 65534
+; NO-SIMD128-NEXT:    i32.and $push12=, $pop11, $pop61
+; NO-SIMD128-NEXT:    i32.const $push60=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push13=, $pop12, $pop60
+; NO-SIMD128-NEXT:    i32.store16 0($pop9), $pop13
+; NO-SIMD128-NEXT:    i32.const $push14=, 10
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.add $push16=, $6, $14
+; NO-SIMD128-NEXT:    i32.const $push59=, 1
+; NO-SIMD128-NEXT:    i32.add $push17=, $pop16, $pop59
+; NO-SIMD128-NEXT:    i32.const $push58=, 65534
+; NO-SIMD128-NEXT:    i32.and $push18=, $pop17, $pop58
+; NO-SIMD128-NEXT:    i32.const $push57=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push19=, $pop18, $pop57
+; NO-SIMD128-NEXT:    i32.store16 0($pop15), $pop19
+; NO-SIMD128-NEXT:    i32.add $push20=, $5, $13
+; NO-SIMD128-NEXT:    i32.const $push56=, 1
+; NO-SIMD128-NEXT:    i32.add $push21=, $pop20, $pop56
+; NO-SIMD128-NEXT:    i32.const $push55=, 65534
+; NO-SIMD128-NEXT:    i32.and $push22=, $pop21, $pop55
+; NO-SIMD128-NEXT:    i32.const $push54=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push23=, $pop22, $pop54
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop23
+; NO-SIMD128-NEXT:    i32.const $push24=, 6
+; NO-SIMD128-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-NEXT:    i32.add $push26=, $4, $12
+; NO-SIMD128-NEXT:    i32.const $push53=, 1
+; NO-SIMD128-NEXT:    i32.add $push27=, $pop26, $pop53
+; NO-SIMD128-NEXT:    i32.const $push52=, 65534
+; NO-SIMD128-NEXT:    i32.and $push28=, $pop27, $pop52
+; NO-SIMD128-NEXT:    i32.const $push51=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push29=, $pop28, $pop51
+; NO-SIMD128-NEXT:    i32.store16 0($pop25), $pop29
+; NO-SIMD128-NEXT:    i32.add $push30=, $3, $11
+; NO-SIMD128-NEXT:    i32.const $push50=, 1
+; NO-SIMD128-NEXT:    i32.add $push31=, $pop30, $pop50
+; NO-SIMD128-NEXT:    i32.const $push49=, 65534
+; NO-SIMD128-NEXT:    i32.and $push32=, $pop31, $pop49
+; NO-SIMD128-NEXT:    i32.const $push48=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push33=, $pop32, $pop48
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop33
+; NO-SIMD128-NEXT:    i32.add $push34=, $2, $10
+; NO-SIMD128-NEXT:    i32.const $push47=, 1
+; NO-SIMD128-NEXT:    i32.add $push35=, $pop34, $pop47
+; NO-SIMD128-NEXT:    i32.const $push46=, 65534
+; NO-SIMD128-NEXT:    i32.and $push36=, $pop35, $pop46
+; NO-SIMD128-NEXT:    i32.const $push45=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push37=, $pop36, $pop45
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop37
+; NO-SIMD128-NEXT:    i32.add $push38=, $1, $9
+; NO-SIMD128-NEXT:    i32.const $push44=, 1
+; NO-SIMD128-NEXT:    i32.add $push39=, $pop38, $pop44
+; NO-SIMD128-NEXT:    i32.const $push43=, 65534
+; NO-SIMD128-NEXT:    i32.and $push40=, $pop39, $pop43
+; NO-SIMD128-NEXT:    i32.const $push42=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push41=, $pop40, $pop42
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop41
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: avgr_u_v8i16:
+; NO-SIMD128-FAST:         .functype avgr_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.add $push0=, $1, $9
+; NO-SIMD128-FAST-NEXT:    i32.const $push1=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push2=, $pop0, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 65534
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop2, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push63=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push5=, $pop4, $pop63
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.add $push6=, $2, $10
+; NO-SIMD128-FAST-NEXT:    i32.const $push62=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push7=, $pop6, $pop62
+; NO-SIMD128-FAST-NEXT:    i32.const $push61=, 65534
+; NO-SIMD128-FAST-NEXT:    i32.and $push8=, $pop7, $pop61
+; NO-SIMD128-FAST-NEXT:    i32.const $push60=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push9=, $pop8, $pop60
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.add $push10=, $3, $11
+; NO-SIMD128-FAST-NEXT:    i32.const $push59=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $pop10, $pop59
+; NO-SIMD128-FAST-NEXT:    i32.const $push58=, 65534
+; NO-SIMD128-FAST-NEXT:    i32.and $push12=, $pop11, $pop58
+; NO-SIMD128-FAST-NEXT:    i32.const $push57=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push13=, $pop12, $pop57
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push14=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.add $push16=, $4, $12
+; NO-SIMD128-FAST-NEXT:    i32.const $push56=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push17=, $pop16, $pop56
+; NO-SIMD128-FAST-NEXT:    i32.const $push55=, 65534
+; NO-SIMD128-FAST-NEXT:    i32.and $push18=, $pop17, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.const $push54=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push19=, $pop18, $pop54
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop15), $pop19
+; NO-SIMD128-FAST-NEXT:    i32.add $push20=, $5, $13
+; NO-SIMD128-FAST-NEXT:    i32.const $push53=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push21=, $pop20, $pop53
+; NO-SIMD128-FAST-NEXT:    i32.const $push52=, 65534
+; NO-SIMD128-FAST-NEXT:    i32.and $push22=, $pop21, $pop52
+; NO-SIMD128-FAST-NEXT:    i32.const $push51=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push23=, $pop22, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop23
+; NO-SIMD128-FAST-NEXT:    i32.const $push24=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-FAST-NEXT:    i32.add $push26=, $6, $14
+; NO-SIMD128-FAST-NEXT:    i32.const $push50=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $pop26, $pop50
+; NO-SIMD128-FAST-NEXT:    i32.const $push49=, 65534
+; NO-SIMD128-FAST-NEXT:    i32.and $push28=, $pop27, $pop49
+; NO-SIMD128-FAST-NEXT:    i32.const $push48=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push29=, $pop28, $pop48
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop25), $pop29
+; NO-SIMD128-FAST-NEXT:    i32.const $push30=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push31=, $0, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.add $push32=, $7, $15
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push33=, $pop32, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.const $push46=, 65534
+; NO-SIMD128-FAST-NEXT:    i32.and $push34=, $pop33, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.const $push45=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push35=, $pop34, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop31), $pop35
+; NO-SIMD128-FAST-NEXT:    i32.const $push36=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push37=, $0, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.add $push38=, $8, $16
+; NO-SIMD128-FAST-NEXT:    i32.const $push44=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push39=, $pop38, $pop44
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, 65534
+; NO-SIMD128-FAST-NEXT:    i32.and $push40=, $pop39, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.const $push42=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push41=, $pop40, $pop42
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop37), $pop41
+; NO-SIMD128-FAST-NEXT:    return
   %a = add nuw <8 x i16> %x, %y
   %b = add nuw <8 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   %c = udiv <8 x i16> %b, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   ret <8 x i16> %c
 }
 
-; CHECK-LABEL: avgr_u_v8i16_wrap:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype avgr_u_v8i16_wrap (v128, v128) -> (v128){{$}}
-; SIMD128-NOT: i16x8.avgr_u
 define <8 x i16> @avgr_u_v8i16_wrap(<8 x i16> %x, <8 x i16> %y) {
+; SIMD128-LABEL: avgr_u_v8i16_wrap:
+; SIMD128:         .functype avgr_u_v8i16_wrap (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.add $push0=, $0, $1
+; SIMD128-NEXT:    v128.const $push1=, 1, 1, 1, 1, 1, 1, 1, 1
+; SIMD128-NEXT:    i16x8.add $push2=, $pop0, $pop1
+; SIMD128-NEXT:    i32.const $push3=, 1
+; SIMD128-NEXT:    i16x8.shr_u $push4=, $pop2, $pop3
+; SIMD128-NEXT:    return $pop4
+;
+; SIMD128-FAST-LABEL: avgr_u_v8i16_wrap:
+; SIMD128-FAST:         .functype avgr_u_v8i16_wrap (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.add $push2=, $0, $1
+; SIMD128-FAST-NEXT:    v128.const $push3=, 1, 1, 1, 1, 1, 1, 1, 1
+; SIMD128-FAST-NEXT:    i16x8.add $push1=, $pop2, $pop3
+; SIMD128-FAST-NEXT:    i32.const $push4=, 1
+; SIMD128-FAST-NEXT:    i16x8.shr_u $push0=, $pop1, $pop4
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: avgr_u_v8i16_wrap:
+; NO-SIMD128:         .functype avgr_u_v8i16_wrap (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 14
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.add $push2=, $8, $16
+; NO-SIMD128-NEXT:    i32.const $push3=, 1
+; NO-SIMD128-NEXT:    i32.add $push4=, $pop2, $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 65534
+; NO-SIMD128-NEXT:    i32.and $push6=, $pop4, $pop5
+; NO-SIMD128-NEXT:    i32.const $push63=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push7=, $pop6, $pop63
+; NO-SIMD128-NEXT:    i32.store16 0($pop1), $pop7
+; NO-SIMD128-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.add $push10=, $7, $15
+; NO-SIMD128-NEXT:    i32.const $push62=, 1
+; NO-SIMD128-NEXT:    i32.add $push11=, $pop10, $pop62
+; NO-SIMD128-NEXT:    i32.const $push61=, 65534
+; NO-SIMD128-NEXT:    i32.and $push12=, $pop11, $pop61
+; NO-SIMD128-NEXT:    i32.const $push60=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push13=, $pop12, $pop60
+; NO-SIMD128-NEXT:    i32.store16 0($pop9), $pop13
+; NO-SIMD128-NEXT:    i32.const $push14=, 10
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.add $push16=, $6, $14
+; NO-SIMD128-NEXT:    i32.const $push59=, 1
+; NO-SIMD128-NEXT:    i32.add $push17=, $pop16, $pop59
+; NO-SIMD128-NEXT:    i32.const $push58=, 65534
+; NO-SIMD128-NEXT:    i32.and $push18=, $pop17, $pop58
+; NO-SIMD128-NEXT:    i32.const $push57=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push19=, $pop18, $pop57
+; NO-SIMD128-NEXT:    i32.store16 0($pop15), $pop19
+; NO-SIMD128-NEXT:    i32.add $push20=, $5, $13
+; NO-SIMD128-NEXT:    i32.const $push56=, 1
+; NO-SIMD128-NEXT:    i32.add $push21=, $pop20, $pop56
+; NO-SIMD128-NEXT:    i32.const $push55=, 65534
+; NO-SIMD128-NEXT:    i32.and $push22=, $pop21, $pop55
+; NO-SIMD128-NEXT:    i32.const $push54=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push23=, $pop22, $pop54
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop23
+; NO-SIMD128-NEXT:    i32.const $push24=, 6
+; NO-SIMD128-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-NEXT:    i32.add $push26=, $4, $12
+; NO-SIMD128-NEXT:    i32.const $push53=, 1
+; NO-SIMD128-NEXT:    i32.add $push27=, $pop26, $pop53
+; NO-SIMD128-NEXT:    i32.const $push52=, 65534
+; NO-SIMD128-NEXT:    i32.and $push28=, $pop27, $pop52
+; NO-SIMD128-NEXT:    i32.const $push51=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push29=, $pop28, $pop51
+; NO-SIMD128-NEXT:    i32.store16 0($pop25), $pop29
+; NO-SIMD128-NEXT:    i32.add $push30=, $3, $11
+; NO-SIMD128-NEXT:    i32.const $push50=, 1
+; NO-SIMD128-NEXT:    i32.add $push31=, $pop30, $pop50
+; NO-SIMD128-NEXT:    i32.const $push49=, 65534
+; NO-SIMD128-NEXT:    i32.and $push32=, $pop31, $pop49
+; NO-SIMD128-NEXT:    i32.const $push48=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push33=, $pop32, $pop48
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop33
+; NO-SIMD128-NEXT:    i32.add $push34=, $2, $10
+; NO-SIMD128-NEXT:    i32.const $push47=, 1
+; NO-SIMD128-NEXT:    i32.add $push35=, $pop34, $pop47
+; NO-SIMD128-NEXT:    i32.const $push46=, 65534
+; NO-SIMD128-NEXT:    i32.and $push36=, $pop35, $pop46
+; NO-SIMD128-NEXT:    i32.const $push45=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push37=, $pop36, $pop45
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop37
+; NO-SIMD128-NEXT:    i32.add $push38=, $1, $9
+; NO-SIMD128-NEXT:    i32.const $push44=, 1
+; NO-SIMD128-NEXT:    i32.add $push39=, $pop38, $pop44
+; NO-SIMD128-NEXT:    i32.const $push43=, 65534
+; NO-SIMD128-NEXT:    i32.and $push40=, $pop39, $pop43
+; NO-SIMD128-NEXT:    i32.const $push42=, 1
+; NO-SIMD128-NEXT:    i32.shr_u $push41=, $pop40, $pop42
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop41
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: avgr_u_v8i16_wrap:
+; NO-SIMD128-FAST:         .functype avgr_u_v8i16_wrap (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.add $push0=, $1, $9
+; NO-SIMD128-FAST-NEXT:    i32.const $push1=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push2=, $pop0, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 65534
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop2, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push63=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push5=, $pop4, $pop63
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.add $push6=, $2, $10
+; NO-SIMD128-FAST-NEXT:    i32.const $push62=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push7=, $pop6, $pop62
+; NO-SIMD128-FAST-NEXT:    i32.const $push61=, 65534
+; NO-SIMD128-FAST-NEXT:    i32.and $push8=, $pop7, $pop61
+; NO-SIMD128-FAST-NEXT:    i32.const $push60=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push9=, $pop8, $pop60
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.add $push10=, $3, $11
+; NO-SIMD128-FAST-NEXT:    i32.const $push59=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $pop10, $pop59
+; NO-SIMD128-FAST-NEXT:    i32.const $push58=, 65534
+; NO-SIMD128-FAST-NEXT:    i32.and $push12=, $pop11, $pop58
+; NO-SIMD128-FAST-NEXT:    i32.const $push57=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push13=, $pop12, $pop57
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push14=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.add $push16=, $4, $12
+; NO-SIMD128-FAST-NEXT:    i32.const $push56=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push17=, $pop16, $pop56
+; NO-SIMD128-FAST-NEXT:    i32.const $push55=, 65534
+; NO-SIMD128-FAST-NEXT:    i32.and $push18=, $pop17, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.const $push54=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push19=, $pop18, $pop54
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop15), $pop19
+; NO-SIMD128-FAST-NEXT:    i32.add $push20=, $5, $13
+; NO-SIMD128-FAST-NEXT:    i32.const $push53=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push21=, $pop20, $pop53
+; NO-SIMD128-FAST-NEXT:    i32.const $push52=, 65534
+; NO-SIMD128-FAST-NEXT:    i32.and $push22=, $pop21, $pop52
+; NO-SIMD128-FAST-NEXT:    i32.const $push51=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push23=, $pop22, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop23
+; NO-SIMD128-FAST-NEXT:    i32.const $push24=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push25=, $0, $pop24
+; NO-SIMD128-FAST-NEXT:    i32.add $push26=, $6, $14
+; NO-SIMD128-FAST-NEXT:    i32.const $push50=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $pop26, $pop50
+; NO-SIMD128-FAST-NEXT:    i32.const $push49=, 65534
+; NO-SIMD128-FAST-NEXT:    i32.and $push28=, $pop27, $pop49
+; NO-SIMD128-FAST-NEXT:    i32.const $push48=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push29=, $pop28, $pop48
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop25), $pop29
+; NO-SIMD128-FAST-NEXT:    i32.const $push30=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push31=, $0, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.add $push32=, $7, $15
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push33=, $pop32, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.const $push46=, 65534
+; NO-SIMD128-FAST-NEXT:    i32.and $push34=, $pop33, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.const $push45=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push35=, $pop34, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop31), $pop35
+; NO-SIMD128-FAST-NEXT:    i32.const $push36=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push37=, $0, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.add $push38=, $8, $16
+; NO-SIMD128-FAST-NEXT:    i32.const $push44=, 1
+; NO-SIMD128-FAST-NEXT:    i32.add $push39=, $pop38, $pop44
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, 65534
+; NO-SIMD128-FAST-NEXT:    i32.and $push40=, $pop39, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.const $push42=, 1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push41=, $pop40, $pop42
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop37), $pop41
+; NO-SIMD128-FAST-NEXT:    return
   %a = add <8 x i16> %x, %y
   %b = add <8 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   %c = udiv <8 x i16> %b, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   ret <8 x i16> %c
 }
 
-; CHECK-LABEL: abs_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype abs_v8i16 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.abs $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @abs_v8i16(<8 x i16> %x) {
+; SIMD128-LABEL: abs_v8i16:
+; SIMD128:         .functype abs_v8i16 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.abs $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: abs_v8i16:
+; SIMD128-FAST:         .functype abs_v8i16 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.abs $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: abs_v8i16:
+; NO-SIMD128:         .functype abs_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push4=, 14
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.extend16_s $push0=, $8
+; NO-SIMD128-NEXT:    i32.const $push1=, 15
+; NO-SIMD128-NEXT:    i32.shr_s $push55=, $pop0, $pop1
+; NO-SIMD128-NEXT:    local.tee $push54=, $9=, $pop55
+; NO-SIMD128-NEXT:    i32.xor $push2=, $8, $pop54
+; NO-SIMD128-NEXT:    i32.sub $push3=, $pop2, $9
+; NO-SIMD128-NEXT:    i32.store16 0($pop5), $pop3
+; NO-SIMD128-NEXT:    i32.const $push9=, 12
+; NO-SIMD128-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT:    i32.extend16_s $push6=, $7
+; NO-SIMD128-NEXT:    i32.const $push53=, 15
+; NO-SIMD128-NEXT:    i32.shr_s $push52=, $pop6, $pop53
+; NO-SIMD128-NEXT:    local.tee $push51=, $8=, $pop52
+; NO-SIMD128-NEXT:    i32.xor $push7=, $7, $pop51
+; NO-SIMD128-NEXT:    i32.sub $push8=, $pop7, $8
+; NO-SIMD128-NEXT:    i32.store16 0($pop10), $pop8
+; NO-SIMD128-NEXT:    i32.const $push14=, 10
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.extend16_s $push11=, $6
+; NO-SIMD128-NEXT:    i32.const $push50=, 15
+; NO-SIMD128-NEXT:    i32.shr_s $push49=, $pop11, $pop50
+; NO-SIMD128-NEXT:    local.tee $push48=, $8=, $pop49
+; NO-SIMD128-NEXT:    i32.xor $push12=, $6, $pop48
+; NO-SIMD128-NEXT:    i32.sub $push13=, $pop12, $8
+; NO-SIMD128-NEXT:    i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT:    i32.extend16_s $push16=, $5
+; NO-SIMD128-NEXT:    i32.const $push47=, 15
+; NO-SIMD128-NEXT:    i32.shr_s $push46=, $pop16, $pop47
+; NO-SIMD128-NEXT:    local.tee $push45=, $8=, $pop46
+; NO-SIMD128-NEXT:    i32.xor $push17=, $5, $pop45
+; NO-SIMD128-NEXT:    i32.sub $push18=, $pop17, $8
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop18
+; NO-SIMD128-NEXT:    i32.const $push22=, 6
+; NO-SIMD128-NEXT:    i32.add $push23=, $0, $pop22
+; NO-SIMD128-NEXT:    i32.extend16_s $push19=, $4
+; NO-SIMD128-NEXT:    i32.const $push44=, 15
+; NO-SIMD128-NEXT:    i32.shr_s $push43=, $pop19, $pop44
+; NO-SIMD128-NEXT:    local.tee $push42=, $8=, $pop43
+; NO-SIMD128-NEXT:    i32.xor $push20=, $4, $pop42
+; NO-SIMD128-NEXT:    i32.sub $push21=, $pop20, $8
+; NO-SIMD128-NEXT:    i32.store16 0($pop23), $pop21
+; NO-SIMD128-NEXT:    i32.extend16_s $push24=, $3
+; NO-SIMD128-NEXT:    i32.const $push41=, 15
+; NO-SIMD128-NEXT:    i32.shr_s $push40=, $pop24, $pop41
+; NO-SIMD128-NEXT:    local.tee $push39=, $8=, $pop40
+; NO-SIMD128-NEXT:    i32.xor $push25=, $3, $pop39
+; NO-SIMD128-NEXT:    i32.sub $push26=, $pop25, $8
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop26
+; NO-SIMD128-NEXT:    i32.extend16_s $push27=, $2
+; NO-SIMD128-NEXT:    i32.const $push38=, 15
+; NO-SIMD128-NEXT:    i32.shr_s $push37=, $pop27, $pop38
+; NO-SIMD128-NEXT:    local.tee $push36=, $8=, $pop37
+; NO-SIMD128-NEXT:    i32.xor $push28=, $2, $pop36
+; NO-SIMD128-NEXT:    i32.sub $push29=, $pop28, $8
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop29
+; NO-SIMD128-NEXT:    i32.extend16_s $push30=, $1
+; NO-SIMD128-NEXT:    i32.const $push35=, 15
+; NO-SIMD128-NEXT:    i32.shr_s $push34=, $pop30, $pop35
+; NO-SIMD128-NEXT:    local.tee $push33=, $8=, $pop34
+; NO-SIMD128-NEXT:    i32.xor $push31=, $1, $pop33
+; NO-SIMD128-NEXT:    i32.sub $push32=, $pop31, $8
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop32
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: abs_v8i16:
+; NO-SIMD128-FAST:         .functype abs_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push0=, $1
+; NO-SIMD128-FAST-NEXT:    i32.const $push1=, 15
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push55=, $pop0, $pop1
+; NO-SIMD128-FAST-NEXT:    local.tee $push54=, $9=, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop54
+; NO-SIMD128-FAST-NEXT:    i32.sub $push3=, $pop2, $9
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push4=, $2
+; NO-SIMD128-FAST-NEXT:    i32.const $push53=, 15
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push52=, $pop4, $pop53
+; NO-SIMD128-FAST-NEXT:    local.tee $push51=, $1=, $pop52
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $2, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.sub $push6=, $pop5, $1
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push7=, $3
+; NO-SIMD128-FAST-NEXT:    i32.const $push50=, 15
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push49=, $pop7, $pop50
+; NO-SIMD128-FAST-NEXT:    local.tee $push48=, $2=, $pop49
+; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $3, $pop48
+; NO-SIMD128-FAST-NEXT:    i32.sub $push9=, $pop8, $2
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push10=, $4
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, 15
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push46=, $pop10, $pop47
+; NO-SIMD128-FAST-NEXT:    local.tee $push45=, $3=, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $4, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.sub $push12=, $pop11, $3
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop14), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push15=, $5
+; NO-SIMD128-FAST-NEXT:    i32.const $push44=, 15
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push43=, $pop15, $pop44
+; NO-SIMD128-FAST-NEXT:    local.tee $push42=, $4=, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.xor $push16=, $5, $pop42
+; NO-SIMD128-FAST-NEXT:    i32.sub $push17=, $pop16, $4
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push18=, $6
+; NO-SIMD128-FAST-NEXT:    i32.const $push41=, 15
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push40=, $pop18, $pop41
+; NO-SIMD128-FAST-NEXT:    local.tee $push39=, $5=, $pop40
+; NO-SIMD128-FAST-NEXT:    i32.xor $push19=, $6, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.sub $push20=, $pop19, $5
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop22), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push23=, $7
+; NO-SIMD128-FAST-NEXT:    i32.const $push38=, 15
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push37=, $pop23, $pop38
+; NO-SIMD128-FAST-NEXT:    local.tee $push36=, $6=, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.xor $push24=, $7, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.sub $push25=, $pop24, $6
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop27), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push31=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push28=, $8
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, 15
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push34=, $pop28, $pop35
+; NO-SIMD128-FAST-NEXT:    local.tee $push33=, $0=, $pop34
+; NO-SIMD128-FAST-NEXT:    i32.xor $push29=, $8, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.sub $push30=, $pop29, $0
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT:    return
   %a = sub <8 x i16> zeroinitializer, %x
   %b = icmp slt <8 x i16> %x, zeroinitializer
   %c = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %x
   ret <8 x i16> %c
 }
 
-; CHECK-LABEL: neg_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype neg_v8i16 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.neg $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @neg_v8i16(<8 x i16> %x) {
+; SIMD128-LABEL: neg_v8i16:
+; SIMD128:         .functype neg_v8i16 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.neg $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: neg_v8i16:
+; SIMD128-FAST:         .functype neg_v8i16 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.neg $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: neg_v8i16:
+; NO-SIMD128:         .functype neg_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 0
+; NO-SIMD128-NEXT:    i32.sub $push1=, $pop0, $5
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop1
+; NO-SIMD128-NEXT:    i32.const $push23=, 0
+; NO-SIMD128-NEXT:    i32.sub $push2=, $pop23, $3
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push22=, 0
+; NO-SIMD128-NEXT:    i32.sub $push3=, $pop22, $2
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push21=, 0
+; NO-SIMD128-NEXT:    i32.sub $push4=, $pop21, $1
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push6=, 14
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.const $push20=, 0
+; NO-SIMD128-NEXT:    i32.sub $push5=, $pop20, $8
+; NO-SIMD128-NEXT:    i32.store16 0($pop7), $pop5
+; NO-SIMD128-NEXT:    i32.const $push9=, 12
+; NO-SIMD128-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT:    i32.const $push19=, 0
+; NO-SIMD128-NEXT:    i32.sub $push8=, $pop19, $7
+; NO-SIMD128-NEXT:    i32.store16 0($pop10), $pop8
+; NO-SIMD128-NEXT:    i32.const $push12=, 10
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.const $push18=, 0
+; NO-SIMD128-NEXT:    i32.sub $push11=, $pop18, $6
+; NO-SIMD128-NEXT:    i32.store16 0($pop13), $pop11
+; NO-SIMD128-NEXT:    i32.const $push15=, 6
+; NO-SIMD128-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-NEXT:    i32.const $push17=, 0
+; NO-SIMD128-NEXT:    i32.sub $push14=, $pop17, $4
+; NO-SIMD128-NEXT:    i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: neg_v8i16:
+; NO-SIMD128-FAST:         .functype neg_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.const $push23=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push2=, $pop23, $2
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push22=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push3=, $pop22, $3
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push6=, $pop21, $4
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push20=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push7=, $pop20, $5
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop7
+; NO-SIMD128-FAST-NEXT:    i32.const $push8=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push19=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push10=, $pop19, $6
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop9), $pop10
+; NO-SIMD128-FAST-NEXT:    i32.const $push11=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.const $push18=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push13=, $pop18, $7
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop12), $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push14=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push16=, $pop17, $8
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT:    return
   %a = sub <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>,
                      %x
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: shl_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype shl_v8i16 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.shl $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @shl_v8i16(<8 x i16> %v, i16 %x) {
+; SIMD128-LABEL: shl_v8i16:
+; SIMD128:         .functype shl_v8i16 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.shl $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: shl_v8i16:
+; SIMD128-FAST:         .functype shl_v8i16 (v128, i32) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.shl $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shl_v8i16:
+; NO-SIMD128:         .functype shl_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-NEXT:    i32.and $push18=, $9, $pop0
+; NO-SIMD128-NEXT:    local.tee $push17=, $9=, $pop18
+; NO-SIMD128-NEXT:    i32.shl $push1=, $5, $pop17
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop1
+; NO-SIMD128-NEXT:    i32.shl $push2=, $3, $9
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop2
+; NO-SIMD128-NEXT:    i32.shl $push3=, $2, $9
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop3
+; NO-SIMD128-NEXT:    i32.shl $push4=, $1, $9
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push6=, 14
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.shl $push5=, $8, $9
+; NO-SIMD128-NEXT:    i32.store16 0($pop7), $pop5
+; NO-SIMD128-NEXT:    i32.const $push9=, 12
+; NO-SIMD128-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT:    i32.shl $push8=, $7, $9
+; NO-SIMD128-NEXT:    i32.store16 0($pop10), $pop8
+; NO-SIMD128-NEXT:    i32.const $push12=, 10
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.shl $push11=, $6, $9
+; NO-SIMD128-NEXT:    i32.store16 0($pop13), $pop11
+; NO-SIMD128-NEXT:    i32.const $push15=, 6
+; NO-SIMD128-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-NEXT:    i32.shl $push14=, $4, $9
+; NO-SIMD128-NEXT:    i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shl_v8i16:
+; NO-SIMD128-FAST:         .functype shl_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push18=, $9, $pop0
+; NO-SIMD128-FAST-NEXT:    local.tee $push17=, $9=, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.shl $push1=, $2, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.shl $push2=, $1, $9
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.shl $push3=, $3, $9
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.shl $push6=, $4, $9
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.shl $push7=, $5, $9
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop7
+; NO-SIMD128-FAST-NEXT:    i32.const $push8=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.shl $push10=, $6, $9
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop9), $pop10
+; NO-SIMD128-FAST-NEXT:    i32.const $push11=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.shl $push13=, $7, $9
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop12), $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push14=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.shl $push16=, $8, $9
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT:    return
   %t = insertelement <8 x i16> undef, i16 %x, i32 0
   %s = shufflevector <8 x i16> %t, <8 x i16> undef,
     <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -512,46 +7662,391 @@ define <8 x i16> @shl_v8i16(<8 x i16> %v, i16 %x) {
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: shl_const_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype shl_const_v8i16 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5
-; SIMD128-NEXT: i16x8.shl $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @shl_const_v8i16(<8 x i16> %v) {
+; SIMD128-LABEL: shl_const_v8i16:
+; SIMD128:         .functype shl_const_v8i16 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32.const $push0=, 5
+; SIMD128-NEXT:    i16x8.shl $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: shl_const_v8i16:
+; SIMD128-FAST:         .functype shl_const_v8i16 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32.const $push1=, 5
+; SIMD128-FAST-NEXT:    i16x8.shl $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shl_const_v8i16:
+; NO-SIMD128:         .functype shl_const_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 5
+; NO-SIMD128-NEXT:    i32.shl $push1=, $5, $pop0
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop1
+; NO-SIMD128-NEXT:    i32.const $push23=, 5
+; NO-SIMD128-NEXT:    i32.shl $push2=, $3, $pop23
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push22=, 5
+; NO-SIMD128-NEXT:    i32.shl $push3=, $2, $pop22
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push21=, 5
+; NO-SIMD128-NEXT:    i32.shl $push4=, $1, $pop21
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push6=, 14
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.const $push20=, 5
+; NO-SIMD128-NEXT:    i32.shl $push5=, $8, $pop20
+; NO-SIMD128-NEXT:    i32.store16 0($pop7), $pop5
+; NO-SIMD128-NEXT:    i32.const $push9=, 12
+; NO-SIMD128-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT:    i32.const $push19=, 5
+; NO-SIMD128-NEXT:    i32.shl $push8=, $7, $pop19
+; NO-SIMD128-NEXT:    i32.store16 0($pop10), $pop8
+; NO-SIMD128-NEXT:    i32.const $push12=, 10
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.const $push18=, 5
+; NO-SIMD128-NEXT:    i32.shl $push11=, $6, $pop18
+; NO-SIMD128-NEXT:    i32.store16 0($pop13), $pop11
+; NO-SIMD128-NEXT:    i32.const $push15=, 6
+; NO-SIMD128-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-NEXT:    i32.const $push17=, 5
+; NO-SIMD128-NEXT:    i32.shl $push14=, $4, $pop17
+; NO-SIMD128-NEXT:    i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shl_const_v8i16:
+; NO-SIMD128-FAST:         .functype shl_const_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.const $push23=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push2=, $2, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push22=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push3=, $3, $pop22
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push6=, $4, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push20=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push7=, $5, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop7
+; NO-SIMD128-FAST-NEXT:    i32.const $push8=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push19=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push10=, $6, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop9), $pop10
+; NO-SIMD128-FAST-NEXT:    i32.const $push11=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.const $push18=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push13=, $7, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop12), $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push14=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push16=, $8, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT:    return
   %a = shl <8 x i16> %v,
     <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: shl_vec_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype shl_vec_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: i16x8.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
-; SIMD128-NEXT: i32.const $push[[M0:[0-9]+]]=, 15{{$}}
-; SIMD128-NEXT: i32.and $push[[M1:[0-9]+]]=, $pop[[L1]], $pop[[M0]]{{$}}
-; SIMD128-NEXT: i32.shl $push[[M2:[0-9]+]]=, $pop[[L0]], $pop[[M1]]{{$}}
-; SIMD128-NEXT: i16x8.splat $push[[M3:[0-9]+]]=, $pop[[M2]]{{$}}
-; Skip 6 lanes
-; SIMD128:      i16x8.extract_lane_u $push[[L4:[0-9]+]]=, $0, 7{{$}}
-; SIMD128-NEXT: i16x8.extract_lane_u $push[[L5:[0-9]+]]=, $1, 7{{$}}
-; SIMD128-NEXT: i32.const $push[[M4:[0-9]+]]=, 15{{$}}
-; SIMD128-NEXT: i32.and $push[[M5:[0-9]+]]=, $pop[[L5]], $pop[[M4]]{{$}}
-; SIMD128-NEXT: i32.shl $push[[M6:[0-9]+]]=, $pop[[L4]], $pop[[M5]]{{$}}
-; SIMD128-NEXT: i16x8.replace_lane $push[[R:[0-9]+]]=, $pop[[M7:[0-9]+]], 7, $pop[[M6]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @shl_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
+; SIMD128-LABEL: shl_vec_v8i16:
+; SIMD128:         .functype shl_vec_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extract_lane_u $push7=, $0, 0
+; SIMD128-NEXT:    i16x8.extract_lane_u $push5=, $1, 0
+; SIMD128-NEXT:    i32.const $push1=, 15
+; SIMD128-NEXT:    i32.and $push6=, $pop5, $pop1
+; SIMD128-NEXT:    i32.shl $push8=, $pop7, $pop6
+; SIMD128-NEXT:    i16x8.splat $push9=, $pop8
+; SIMD128-NEXT:    i16x8.extract_lane_u $push3=, $0, 1
+; SIMD128-NEXT:    i16x8.extract_lane_u $push0=, $1, 1
+; SIMD128-NEXT:    i32.const $push47=, 15
+; SIMD128-NEXT:    i32.and $push2=, $pop0, $pop47
+; SIMD128-NEXT:    i32.shl $push4=, $pop3, $pop2
+; SIMD128-NEXT:    i16x8.replace_lane $push10=, $pop9, 1, $pop4
+; SIMD128-NEXT:    i16x8.extract_lane_u $push13=, $0, 2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push11=, $1, 2
+; SIMD128-NEXT:    i32.const $push46=, 15
+; SIMD128-NEXT:    i32.and $push12=, $pop11, $pop46
+; SIMD128-NEXT:    i32.shl $push14=, $pop13, $pop12
+; SIMD128-NEXT:    i16x8.replace_lane $push15=, $pop10, 2, $pop14
+; SIMD128-NEXT:    i16x8.extract_lane_u $push18=, $0, 3
+; SIMD128-NEXT:    i16x8.extract_lane_u $push16=, $1, 3
+; SIMD128-NEXT:    i32.const $push45=, 15
+; SIMD128-NEXT:    i32.and $push17=, $pop16, $pop45
+; SIMD128-NEXT:    i32.shl $push19=, $pop18, $pop17
+; SIMD128-NEXT:    i16x8.replace_lane $push20=, $pop15, 3, $pop19
+; SIMD128-NEXT:    i16x8.extract_lane_u $push23=, $0, 4
+; SIMD128-NEXT:    i16x8.extract_lane_u $push21=, $1, 4
+; SIMD128-NEXT:    i32.const $push44=, 15
+; SIMD128-NEXT:    i32.and $push22=, $pop21, $pop44
+; SIMD128-NEXT:    i32.shl $push24=, $pop23, $pop22
+; SIMD128-NEXT:    i16x8.replace_lane $push25=, $pop20, 4, $pop24
+; SIMD128-NEXT:    i16x8.extract_lane_u $push28=, $0, 5
+; SIMD128-NEXT:    i16x8.extract_lane_u $push26=, $1, 5
+; SIMD128-NEXT:    i32.const $push43=, 15
+; SIMD128-NEXT:    i32.and $push27=, $pop26, $pop43
+; SIMD128-NEXT:    i32.shl $push29=, $pop28, $pop27
+; SIMD128-NEXT:    i16x8.replace_lane $push30=, $pop25, 5, $pop29
+; SIMD128-NEXT:    i16x8.extract_lane_u $push33=, $0, 6
+; SIMD128-NEXT:    i16x8.extract_lane_u $push31=, $1, 6
+; SIMD128-NEXT:    i32.const $push42=, 15
+; SIMD128-NEXT:    i32.and $push32=, $pop31, $pop42
+; SIMD128-NEXT:    i32.shl $push34=, $pop33, $pop32
+; SIMD128-NEXT:    i16x8.replace_lane $push35=, $pop30, 6, $pop34
+; SIMD128-NEXT:    i16x8.extract_lane_u $push38=, $0, 7
+; SIMD128-NEXT:    i16x8.extract_lane_u $push36=, $1, 7
+; SIMD128-NEXT:    i32.const $push41=, 15
+; SIMD128-NEXT:    i32.and $push37=, $pop36, $pop41
+; SIMD128-NEXT:    i32.shl $push39=, $pop38, $pop37
+; SIMD128-NEXT:    i16x8.replace_lane $push40=, $pop35, 7, $pop39
+; SIMD128-NEXT:    return $pop40
+;
+; SIMD128-FAST-LABEL: shl_vec_v8i16:
+; SIMD128-FAST:         .functype shl_vec_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push8=, $0, 0
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push6=, $1, 0
+; SIMD128-FAST-NEXT:    i32.const $push2=, 15
+; SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $pop2
+; SIMD128-FAST-NEXT:    i32.shl $push9=, $pop8, $pop7
+; SIMD128-FAST-NEXT:    i16x8.splat $push10=, $pop9
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push4=, $0, 1
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push1=, $1, 1
+; SIMD128-FAST-NEXT:    i32.const $push47=, 15
+; SIMD128-FAST-NEXT:    i32.and $push3=, $pop1, $pop47
+; SIMD128-FAST-NEXT:    i32.shl $push5=, $pop4, $pop3
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push11=, $pop10, 1, $pop5
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push14=, $0, 2
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push12=, $1, 2
+; SIMD128-FAST-NEXT:    i32.const $push46=, 15
+; SIMD128-FAST-NEXT:    i32.and $push13=, $pop12, $pop46
+; SIMD128-FAST-NEXT:    i32.shl $push15=, $pop14, $pop13
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push16=, $pop11, 2, $pop15
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push19=, $0, 3
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push17=, $1, 3
+; SIMD128-FAST-NEXT:    i32.const $push45=, 15
+; SIMD128-FAST-NEXT:    i32.and $push18=, $pop17, $pop45
+; SIMD128-FAST-NEXT:    i32.shl $push20=, $pop19, $pop18
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push21=, $pop16, 3, $pop20
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push24=, $0, 4
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push22=, $1, 4
+; SIMD128-FAST-NEXT:    i32.const $push44=, 15
+; SIMD128-FAST-NEXT:    i32.and $push23=, $pop22, $pop44
+; SIMD128-FAST-NEXT:    i32.shl $push25=, $pop24, $pop23
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push26=, $pop21, 4, $pop25
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push29=, $0, 5
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push27=, $1, 5
+; SIMD128-FAST-NEXT:    i32.const $push43=, 15
+; SIMD128-FAST-NEXT:    i32.and $push28=, $pop27, $pop43
+; SIMD128-FAST-NEXT:    i32.shl $push30=, $pop29, $pop28
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push31=, $pop26, 5, $pop30
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push34=, $0, 6
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push32=, $1, 6
+; SIMD128-FAST-NEXT:    i32.const $push42=, 15
+; SIMD128-FAST-NEXT:    i32.and $push33=, $pop32, $pop42
+; SIMD128-FAST-NEXT:    i32.shl $push35=, $pop34, $pop33
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push36=, $pop31, 6, $pop35
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push39=, $0, 7
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push37=, $1, 7
+; SIMD128-FAST-NEXT:    i32.const $push41=, 15
+; SIMD128-FAST-NEXT:    i32.and $push38=, $pop37, $pop41
+; SIMD128-FAST-NEXT:    i32.shl $push40=, $pop39, $pop38
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push0=, $pop36, 7, $pop40
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shl_vec_v8i16:
+; NO-SIMD128:         .functype shl_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-NEXT:    i32.and $push1=, $13, $pop0
+; NO-SIMD128-NEXT:    i32.shl $push2=, $5, $pop1
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push31=, 65535
+; NO-SIMD128-NEXT:    i32.and $push3=, $11, $pop31
+; NO-SIMD128-NEXT:    i32.shl $push4=, $3, $pop3
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push30=, 65535
+; NO-SIMD128-NEXT:    i32.and $push5=, $10, $pop30
+; NO-SIMD128-NEXT:    i32.shl $push6=, $2, $pop5
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT:    i32.const $push29=, 65535
+; NO-SIMD128-NEXT:    i32.and $push7=, $9, $pop29
+; NO-SIMD128-NEXT:    i32.shl $push8=, $1, $pop7
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop8
+; NO-SIMD128-NEXT:    i32.const $push11=, 14
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.const $push28=, 65535
+; NO-SIMD128-NEXT:    i32.and $push9=, $16, $pop28
+; NO-SIMD128-NEXT:    i32.shl $push10=, $8, $pop9
+; NO-SIMD128-NEXT:    i32.store16 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push15=, 12
+; NO-SIMD128-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-NEXT:    i32.const $push27=, 65535
+; NO-SIMD128-NEXT:    i32.and $push13=, $15, $pop27
+; NO-SIMD128-NEXT:    i32.shl $push14=, $7, $pop13
+; NO-SIMD128-NEXT:    i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT:    i32.const $push19=, 10
+; NO-SIMD128-NEXT:    i32.add $push20=, $0, $pop19
+; NO-SIMD128-NEXT:    i32.const $push26=, 65535
+; NO-SIMD128-NEXT:    i32.and $push17=, $14, $pop26
+; NO-SIMD128-NEXT:    i32.shl $push18=, $6, $pop17
+; NO-SIMD128-NEXT:    i32.store16 0($pop20), $pop18
+; NO-SIMD128-NEXT:    i32.const $push23=, 6
+; NO-SIMD128-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-NEXT:    i32.const $push25=, 65535
+; NO-SIMD128-NEXT:    i32.and $push21=, $12, $pop25
+; NO-SIMD128-NEXT:    i32.shl $push22=, $4, $pop21
+; NO-SIMD128-NEXT:    i32.store16 0($pop24), $pop22
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shl_vec_v8i16:
+; NO-SIMD128-FAST:         .functype shl_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $9, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.shl $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push31=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $10, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.shl $push4=, $2, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push30=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $11, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.shl $push6=, $3, $pop5
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.const $push29=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $12, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.shl $push10=, $4, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop8), $pop10
+; NO-SIMD128-FAST-NEXT:    i32.const $push28=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $13, $pop28
+; NO-SIMD128-FAST-NEXT:    i32.shl $push12=, $5, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $14, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.shl $push16=, $6, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop14), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $15, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.shl $push20=, $7, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop18), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push25=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $16, $pop25
+; NO-SIMD128-FAST-NEXT:    i32.shl $push24=, $8, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop22), $pop24
+; NO-SIMD128-FAST-NEXT:    return
   %a = shl <8 x i16> %v, %x
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: shr_s_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype shr_s_v8i16 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.shr_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) {
+; SIMD128-LABEL: shr_s_v8i16:
+; SIMD128:         .functype shr_s_v8i16 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.shr_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: shr_s_v8i16:
+; SIMD128-FAST:         .functype shr_s_v8i16 (v128, i32) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.shr_s $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_s_v8i16:
+; NO-SIMD128:         .functype shr_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.extend16_s $push1=, $5
+; NO-SIMD128-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-NEXT:    i32.and $push26=, $9, $pop0
+; NO-SIMD128-NEXT:    local.tee $push25=, $9=, $pop26
+; NO-SIMD128-NEXT:    i32.shr_s $push2=, $pop1, $pop25
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop2
+; NO-SIMD128-NEXT:    i32.extend16_s $push3=, $3
+; NO-SIMD128-NEXT:    i32.shr_s $push4=, $pop3, $9
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop4
+; NO-SIMD128-NEXT:    i32.extend16_s $push5=, $2
+; NO-SIMD128-NEXT:    i32.shr_s $push6=, $pop5, $9
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT:    i32.extend16_s $push7=, $1
+; NO-SIMD128-NEXT:    i32.shr_s $push8=, $pop7, $9
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop8
+; NO-SIMD128-NEXT:    i32.const $push11=, 14
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.extend16_s $push9=, $8
+; NO-SIMD128-NEXT:    i32.shr_s $push10=, $pop9, $9
+; NO-SIMD128-NEXT:    i32.store16 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push15=, 12
+; NO-SIMD128-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-NEXT:    i32.extend16_s $push13=, $7
+; NO-SIMD128-NEXT:    i32.shr_s $push14=, $pop13, $9
+; NO-SIMD128-NEXT:    i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT:    i32.const $push19=, 10
+; NO-SIMD128-NEXT:    i32.add $push20=, $0, $pop19
+; NO-SIMD128-NEXT:    i32.extend16_s $push17=, $6
+; NO-SIMD128-NEXT:    i32.shr_s $push18=, $pop17, $9
+; NO-SIMD128-NEXT:    i32.store16 0($pop20), $pop18
+; NO-SIMD128-NEXT:    i32.const $push23=, 6
+; NO-SIMD128-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-NEXT:    i32.extend16_s $push21=, $4
+; NO-SIMD128-NEXT:    i32.shr_s $push22=, $pop21, $9
+; NO-SIMD128-NEXT:    i32.store16 0($pop24), $pop22
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_s_v8i16:
+; NO-SIMD128-FAST:         .functype shr_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push1=, $1
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push26=, $9, $pop0
+; NO-SIMD128-FAST-NEXT:    local.tee $push25=, $1=, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push2=, $pop1, $pop25
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push3=, $2
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push4=, $pop3, $1
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push5=, $3
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push6=, $pop5, $1
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push9=, $4
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push10=, $pop9, $1
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop8), $pop10
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push11=, $5
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push12=, $pop11, $1
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push15=, $6
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push16=, $pop15, $1
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop14), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push19=, $7
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push20=, $pop19, $1
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop18), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push23=, $8
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push24=, $pop23, $1
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop22), $pop24
+; NO-SIMD128-FAST-NEXT:    return
   %t = insertelement <8 x i16> undef, i16 %x, i32 0
   %s = shufflevector <8 x i16> %t, <8 x i16> undef,
     <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -559,34 +8054,330 @@ define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) {
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: shr_s_vec_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype shr_s_vec_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.extract_lane_s $push[[L0:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: i16x8.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
-; SIMD128-NEXT: i32.const $push[[M0:[0-9]+]]=, 15{{$}}
-; SIMD128-NEXT: i32.and $push[[M1:[0-9]+]]=, $pop[[L1]], $pop[[M0]]{{$}}
-; SIMD128-NEXT: i32.shr_s $push[[M2:[0-9]+]]=, $pop[[L0]], $pop[[M1]]{{$}}
-; SIMD128-NEXT: i16x8.splat $push[[M3:[0-9]+]]=, $pop[[M2]]{{$}}
-; Skip 6 lanes
-; SIMD128:      i16x8.extract_lane_s $push[[L4:[0-9]+]]=, $0, 7{{$}}
-; SIMD128-NEXT: i16x8.extract_lane_u $push[[L5:[0-9]+]]=, $1, 7{{$}}
-; SIMD128-NEXT: i32.const $push[[M4:[0-9]+]]=, 15{{$}}
-; SIMD128-NEXT: i32.and $push[[M5:[0-9]+]]=, $pop[[L5]], $pop[[M4]]{{$}}
-; SIMD128-NEXT: i32.shr_s $push[[M6:[0-9]+]]=, $pop[[L4]], $pop[[M5]]{{$}}
-; SIMD128-NEXT: i16x8.replace_lane $push[[R:[0-9]+]]=, $pop[[M7:[0-9]+]], 7, $pop[[M6]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @shr_s_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
+; SIMD128-LABEL: shr_s_vec_v8i16:
+; SIMD128:         .functype shr_s_vec_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extract_lane_s $push7=, $0, 0
+; SIMD128-NEXT:    i16x8.extract_lane_u $push5=, $1, 0
+; SIMD128-NEXT:    i32.const $push1=, 15
+; SIMD128-NEXT:    i32.and $push6=, $pop5, $pop1
+; SIMD128-NEXT:    i32.shr_s $push8=, $pop7, $pop6
+; SIMD128-NEXT:    i16x8.splat $push9=, $pop8
+; SIMD128-NEXT:    i16x8.extract_lane_s $push3=, $0, 1
+; SIMD128-NEXT:    i16x8.extract_lane_u $push0=, $1, 1
+; SIMD128-NEXT:    i32.const $push47=, 15
+; SIMD128-NEXT:    i32.and $push2=, $pop0, $pop47
+; SIMD128-NEXT:    i32.shr_s $push4=, $pop3, $pop2
+; SIMD128-NEXT:    i16x8.replace_lane $push10=, $pop9, 1, $pop4
+; SIMD128-NEXT:    i16x8.extract_lane_s $push13=, $0, 2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push11=, $1, 2
+; SIMD128-NEXT:    i32.const $push46=, 15
+; SIMD128-NEXT:    i32.and $push12=, $pop11, $pop46
+; SIMD128-NEXT:    i32.shr_s $push14=, $pop13, $pop12
+; SIMD128-NEXT:    i16x8.replace_lane $push15=, $pop10, 2, $pop14
+; SIMD128-NEXT:    i16x8.extract_lane_s $push18=, $0, 3
+; SIMD128-NEXT:    i16x8.extract_lane_u $push16=, $1, 3
+; SIMD128-NEXT:    i32.const $push45=, 15
+; SIMD128-NEXT:    i32.and $push17=, $pop16, $pop45
+; SIMD128-NEXT:    i32.shr_s $push19=, $pop18, $pop17
+; SIMD128-NEXT:    i16x8.replace_lane $push20=, $pop15, 3, $pop19
+; SIMD128-NEXT:    i16x8.extract_lane_s $push23=, $0, 4
+; SIMD128-NEXT:    i16x8.extract_lane_u $push21=, $1, 4
+; SIMD128-NEXT:    i32.const $push44=, 15
+; SIMD128-NEXT:    i32.and $push22=, $pop21, $pop44
+; SIMD128-NEXT:    i32.shr_s $push24=, $pop23, $pop22
+; SIMD128-NEXT:    i16x8.replace_lane $push25=, $pop20, 4, $pop24
+; SIMD128-NEXT:    i16x8.extract_lane_s $push28=, $0, 5
+; SIMD128-NEXT:    i16x8.extract_lane_u $push26=, $1, 5
+; SIMD128-NEXT:    i32.const $push43=, 15
+; SIMD128-NEXT:    i32.and $push27=, $pop26, $pop43
+; SIMD128-NEXT:    i32.shr_s $push29=, $pop28, $pop27
+; SIMD128-NEXT:    i16x8.replace_lane $push30=, $pop25, 5, $pop29
+; SIMD128-NEXT:    i16x8.extract_lane_s $push33=, $0, 6
+; SIMD128-NEXT:    i16x8.extract_lane_u $push31=, $1, 6
+; SIMD128-NEXT:    i32.const $push42=, 15
+; SIMD128-NEXT:    i32.and $push32=, $pop31, $pop42
+; SIMD128-NEXT:    i32.shr_s $push34=, $pop33, $pop32
+; SIMD128-NEXT:    i16x8.replace_lane $push35=, $pop30, 6, $pop34
+; SIMD128-NEXT:    i16x8.extract_lane_s $push38=, $0, 7
+; SIMD128-NEXT:    i16x8.extract_lane_u $push36=, $1, 7
+; SIMD128-NEXT:    i32.const $push41=, 15
+; SIMD128-NEXT:    i32.and $push37=, $pop36, $pop41
+; SIMD128-NEXT:    i32.shr_s $push39=, $pop38, $pop37
+; SIMD128-NEXT:    i16x8.replace_lane $push40=, $pop35, 7, $pop39
+; SIMD128-NEXT:    return $pop40
+;
+; SIMD128-FAST-LABEL: shr_s_vec_v8i16:
+; SIMD128-FAST:         .functype shr_s_vec_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_s $push8=, $0, 0
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push6=, $1, 0
+; SIMD128-FAST-NEXT:    i32.const $push2=, 15
+; SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $pop2
+; SIMD128-FAST-NEXT:    i32.shr_s $push9=, $pop8, $pop7
+; SIMD128-FAST-NEXT:    i16x8.splat $push10=, $pop9
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_s $push4=, $0, 1
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push1=, $1, 1
+; SIMD128-FAST-NEXT:    i32.const $push47=, 15
+; SIMD128-FAST-NEXT:    i32.and $push3=, $pop1, $pop47
+; SIMD128-FAST-NEXT:    i32.shr_s $push5=, $pop4, $pop3
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push11=, $pop10, 1, $pop5
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_s $push14=, $0, 2
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push12=, $1, 2
+; SIMD128-FAST-NEXT:    i32.const $push46=, 15
+; SIMD128-FAST-NEXT:    i32.and $push13=, $pop12, $pop46
+; SIMD128-FAST-NEXT:    i32.shr_s $push15=, $pop14, $pop13
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push16=, $pop11, 2, $pop15
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_s $push19=, $0, 3
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push17=, $1, 3
+; SIMD128-FAST-NEXT:    i32.const $push45=, 15
+; SIMD128-FAST-NEXT:    i32.and $push18=, $pop17, $pop45
+; SIMD128-FAST-NEXT:    i32.shr_s $push20=, $pop19, $pop18
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push21=, $pop16, 3, $pop20
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_s $push24=, $0, 4
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push22=, $1, 4
+; SIMD128-FAST-NEXT:    i32.const $push44=, 15
+; SIMD128-FAST-NEXT:    i32.and $push23=, $pop22, $pop44
+; SIMD128-FAST-NEXT:    i32.shr_s $push25=, $pop24, $pop23
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push26=, $pop21, 4, $pop25
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_s $push29=, $0, 5
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push27=, $1, 5
+; SIMD128-FAST-NEXT:    i32.const $push43=, 15
+; SIMD128-FAST-NEXT:    i32.and $push28=, $pop27, $pop43
+; SIMD128-FAST-NEXT:    i32.shr_s $push30=, $pop29, $pop28
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push31=, $pop26, 5, $pop30
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_s $push34=, $0, 6
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push32=, $1, 6
+; SIMD128-FAST-NEXT:    i32.const $push42=, 15
+; SIMD128-FAST-NEXT:    i32.and $push33=, $pop32, $pop42
+; SIMD128-FAST-NEXT:    i32.shr_s $push35=, $pop34, $pop33
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push36=, $pop31, 6, $pop35
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_s $push39=, $0, 7
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push37=, $1, 7
+; SIMD128-FAST-NEXT:    i32.const $push41=, 15
+; SIMD128-FAST-NEXT:    i32.and $push38=, $pop37, $pop41
+; SIMD128-FAST-NEXT:    i32.shr_s $push40=, $pop39, $pop38
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push0=, $pop36, 7, $pop40
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_s_vec_v8i16:
+; NO-SIMD128:         .functype shr_s_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.extend16_s $push2=, $5
+; NO-SIMD128-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-NEXT:    i32.and $push1=, $13, $pop0
+; NO-SIMD128-NEXT:    i32.shr_s $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT:    i32.extend16_s $push5=, $3
+; NO-SIMD128-NEXT:    i32.const $push39=, 65535
+; NO-SIMD128-NEXT:    i32.and $push4=, $11, $pop39
+; NO-SIMD128-NEXT:    i32.shr_s $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop6
+; NO-SIMD128-NEXT:    i32.extend16_s $push8=, $2
+; NO-SIMD128-NEXT:    i32.const $push38=, 65535
+; NO-SIMD128-NEXT:    i32.and $push7=, $10, $pop38
+; NO-SIMD128-NEXT:    i32.shr_s $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop9
+; NO-SIMD128-NEXT:    i32.extend16_s $push11=, $1
+; NO-SIMD128-NEXT:    i32.const $push37=, 65535
+; NO-SIMD128-NEXT:    i32.and $push10=, $9, $pop37
+; NO-SIMD128-NEXT:    i32.shr_s $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop12
+; NO-SIMD128-NEXT:    i32.const $push16=, 14
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.extend16_s $push14=, $8
+; NO-SIMD128-NEXT:    i32.const $push36=, 65535
+; NO-SIMD128-NEXT:    i32.and $push13=, $16, $pop36
+; NO-SIMD128-NEXT:    i32.shr_s $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT:    i32.store16 0($pop17), $pop15
+; NO-SIMD128-NEXT:    i32.const $push21=, 12
+; NO-SIMD128-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT:    i32.extend16_s $push19=, $7
+; NO-SIMD128-NEXT:    i32.const $push35=, 65535
+; NO-SIMD128-NEXT:    i32.and $push18=, $15, $pop35
+; NO-SIMD128-NEXT:    i32.shr_s $push20=, $pop19, $pop18
+; NO-SIMD128-NEXT:    i32.store16 0($pop22), $pop20
+; NO-SIMD128-NEXT:    i32.const $push26=, 10
+; NO-SIMD128-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT:    i32.extend16_s $push24=, $6
+; NO-SIMD128-NEXT:    i32.const $push34=, 65535
+; NO-SIMD128-NEXT:    i32.and $push23=, $14, $pop34
+; NO-SIMD128-NEXT:    i32.shr_s $push25=, $pop24, $pop23
+; NO-SIMD128-NEXT:    i32.store16 0($pop27), $pop25
+; NO-SIMD128-NEXT:    i32.const $push31=, 6
+; NO-SIMD128-NEXT:    i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT:    i32.extend16_s $push29=, $4
+; NO-SIMD128-NEXT:    i32.const $push33=, 65535
+; NO-SIMD128-NEXT:    i32.and $push28=, $12, $pop33
+; NO-SIMD128-NEXT:    i32.shr_s $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT:    i32.store16 0($pop32), $pop30
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_s_vec_v8i16:
+; NO-SIMD128-FAST:         .functype shr_s_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push2=, $1
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $9, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push5=, $2
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $10, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push8=, $3
+; NO-SIMD128-FAST-NEXT:    i32.const $push38=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $11, $pop38
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push10=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push13=, $4
+; NO-SIMD128-FAST-NEXT:    i32.const $push37=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push12=, $12, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push14=, $pop13, $pop12
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop11), $pop14
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push16=, $5
+; NO-SIMD128-FAST-NEXT:    i32.const $push36=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $13, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push17=, $pop16, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push18=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push21=, $6
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push20=, $14, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push22=, $pop21, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop19), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push23=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push26=, $7
+; NO-SIMD128-FAST-NEXT:    i32.const $push34=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $15, $pop34
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop24), $pop27
+; NO-SIMD128-FAST-NEXT:    i32.const $push28=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push29=, $0, $pop28
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push31=, $8
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push30=, $16, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push32=, $pop31, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop29), $pop32
+; NO-SIMD128-FAST-NEXT:    return
   %a = ashr <8 x i16> %v, %x
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: shr_u_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype shr_u_v8i16 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.shr_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @shr_u_v8i16(<8 x i16> %v, i16 %x) {
+; SIMD128-LABEL: shr_u_v8i16:
+; SIMD128:         .functype shr_u_v8i16 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.shr_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: shr_u_v8i16:
+; SIMD128-FAST:         .functype shr_u_v8i16 (v128, i32) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.shr_u $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_u_v8i16:
+; NO-SIMD128:         .functype shr_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-NEXT:    i32.and $push1=, $5, $pop0
+; NO-SIMD128-NEXT:    i32.const $push34=, 65535
+; NO-SIMD128-NEXT:    i32.and $push33=, $9, $pop34
+; NO-SIMD128-NEXT:    local.tee $push32=, $9=, $pop33
+; NO-SIMD128-NEXT:    i32.shr_u $push2=, $pop1, $pop32
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push31=, 65535
+; NO-SIMD128-NEXT:    i32.and $push3=, $3, $pop31
+; NO-SIMD128-NEXT:    i32.shr_u $push4=, $pop3, $9
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push30=, 65535
+; NO-SIMD128-NEXT:    i32.and $push5=, $2, $pop30
+; NO-SIMD128-NEXT:    i32.shr_u $push6=, $pop5, $9
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT:    i32.const $push29=, 65535
+; NO-SIMD128-NEXT:    i32.and $push7=, $1, $pop29
+; NO-SIMD128-NEXT:    i32.shr_u $push8=, $pop7, $9
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop8
+; NO-SIMD128-NEXT:    i32.const $push11=, 14
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.const $push28=, 65535
+; NO-SIMD128-NEXT:    i32.and $push9=, $8, $pop28
+; NO-SIMD128-NEXT:    i32.shr_u $push10=, $pop9, $9
+; NO-SIMD128-NEXT:    i32.store16 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push15=, 12
+; NO-SIMD128-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-NEXT:    i32.const $push27=, 65535
+; NO-SIMD128-NEXT:    i32.and $push13=, $7, $pop27
+; NO-SIMD128-NEXT:    i32.shr_u $push14=, $pop13, $9
+; NO-SIMD128-NEXT:    i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT:    i32.const $push19=, 10
+; NO-SIMD128-NEXT:    i32.add $push20=, $0, $pop19
+; NO-SIMD128-NEXT:    i32.const $push26=, 65535
+; NO-SIMD128-NEXT:    i32.and $push17=, $6, $pop26
+; NO-SIMD128-NEXT:    i32.shr_u $push18=, $pop17, $9
+; NO-SIMD128-NEXT:    i32.store16 0($pop20), $pop18
+; NO-SIMD128-NEXT:    i32.const $push23=, 6
+; NO-SIMD128-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-NEXT:    i32.const $push25=, 65535
+; NO-SIMD128-NEXT:    i32.and $push21=, $4, $pop25
+; NO-SIMD128-NEXT:    i32.shr_u $push22=, $pop21, $9
+; NO-SIMD128-NEXT:    i32.store16 0($pop24), $pop22
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_u_v8i16:
+; NO-SIMD128-FAST:         .functype shr_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.const $push34=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push33=, $9, $pop34
+; NO-SIMD128-FAST-NEXT:    local.tee $push32=, $1=, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push2=, $pop1, $pop32
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push31=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $2, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push4=, $pop3, $1
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push30=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $3, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push6=, $pop5, $1
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push9=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push29=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $4, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push8=, $pop7, $1
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push28=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $5, $pop28
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push12=, $pop11, $1
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push15=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $6, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push14=, $pop13, $1
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop16), $pop14
+; NO-SIMD128-FAST-NEXT:    i32.const $push19=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push20=, $0, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push17=, $7, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push18=, $pop17, $1
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop20), $pop18
+; NO-SIMD128-FAST-NEXT:    i32.const $push23=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.const $push25=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push21=, $8, $pop25
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push22=, $pop21, $1
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop24), $pop22
+; NO-SIMD128-FAST-NEXT:    return
   %t = insertelement <8 x i16> undef, i16 %x, i32 0
   %s = shufflevector <8 x i16> %t, <8 x i16> undef,
     <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -594,95 +8385,797 @@ define <8 x i16> @shr_u_v8i16(<8 x i16> %v, i16 %x) {
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: shr_u_vec_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype shr_u_vec_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: i16x8.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
-; SIMD128-NEXT: i32.const $push[[M0:[0-9]+]]=, 15{{$}}
-; SIMD128-NEXT: i32.and $push[[M1:[0-9]+]]=, $pop[[L1]], $pop[[M0]]{{$}}
-; SIMD128-NEXT: i32.shr_u $push[[M2:[0-9]+]]=, $pop[[L0]], $pop[[M1]]{{$}}
-; SIMD128-NEXT: i16x8.splat $push[[M3:[0-9]+]]=, $pop[[M2]]{{$}}
-; Skip 6 lanes
-; SIMD128:      i16x8.extract_lane_u $push[[L4:[0-9]+]]=, $0, 7{{$}}
-; SIMD128-NEXT: i16x8.extract_lane_u $push[[L5:[0-9]+]]=, $1, 7{{$}}
-; SIMD128-NEXT: i32.const $push[[M4:[0-9]+]]=, 15{{$}}
-; SIMD128-NEXT: i32.and $push[[M5:[0-9]+]]=, $pop[[L5]], $pop[[M4]]{{$}}
-; SIMD128-NEXT: i32.shr_u $push[[M6:[0-9]+]]=, $pop[[L4]], $pop[[M5]]{{$}}
-; SIMD128-NEXT: i16x8.replace_lane $push[[R:[0-9]+]]=, $pop[[M7:[0-9]+]], 7, $pop[[M6]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @shr_u_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
+; SIMD128-LABEL: shr_u_vec_v8i16:
+; SIMD128:         .functype shr_u_vec_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extract_lane_u $push7=, $0, 0
+; SIMD128-NEXT:    i16x8.extract_lane_u $push5=, $1, 0
+; SIMD128-NEXT:    i32.const $push1=, 15
+; SIMD128-NEXT:    i32.and $push6=, $pop5, $pop1
+; SIMD128-NEXT:    i32.shr_u $push8=, $pop7, $pop6
+; SIMD128-NEXT:    i16x8.splat $push9=, $pop8
+; SIMD128-NEXT:    i16x8.extract_lane_u $push3=, $0, 1
+; SIMD128-NEXT:    i16x8.extract_lane_u $push0=, $1, 1
+; SIMD128-NEXT:    i32.const $push47=, 15
+; SIMD128-NEXT:    i32.and $push2=, $pop0, $pop47
+; SIMD128-NEXT:    i32.shr_u $push4=, $pop3, $pop2
+; SIMD128-NEXT:    i16x8.replace_lane $push10=, $pop9, 1, $pop4
+; SIMD128-NEXT:    i16x8.extract_lane_u $push13=, $0, 2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push11=, $1, 2
+; SIMD128-NEXT:    i32.const $push46=, 15
+; SIMD128-NEXT:    i32.and $push12=, $pop11, $pop46
+; SIMD128-NEXT:    i32.shr_u $push14=, $pop13, $pop12
+; SIMD128-NEXT:    i16x8.replace_lane $push15=, $pop10, 2, $pop14
+; SIMD128-NEXT:    i16x8.extract_lane_u $push18=, $0, 3
+; SIMD128-NEXT:    i16x8.extract_lane_u $push16=, $1, 3
+; SIMD128-NEXT:    i32.const $push45=, 15
+; SIMD128-NEXT:    i32.and $push17=, $pop16, $pop45
+; SIMD128-NEXT:    i32.shr_u $push19=, $pop18, $pop17
+; SIMD128-NEXT:    i16x8.replace_lane $push20=, $pop15, 3, $pop19
+; SIMD128-NEXT:    i16x8.extract_lane_u $push23=, $0, 4
+; SIMD128-NEXT:    i16x8.extract_lane_u $push21=, $1, 4
+; SIMD128-NEXT:    i32.const $push44=, 15
+; SIMD128-NEXT:    i32.and $push22=, $pop21, $pop44
+; SIMD128-NEXT:    i32.shr_u $push24=, $pop23, $pop22
+; SIMD128-NEXT:    i16x8.replace_lane $push25=, $pop20, 4, $pop24
+; SIMD128-NEXT:    i16x8.extract_lane_u $push28=, $0, 5
+; SIMD128-NEXT:    i16x8.extract_lane_u $push26=, $1, 5
+; SIMD128-NEXT:    i32.const $push43=, 15
+; SIMD128-NEXT:    i32.and $push27=, $pop26, $pop43
+; SIMD128-NEXT:    i32.shr_u $push29=, $pop28, $pop27
+; SIMD128-NEXT:    i16x8.replace_lane $push30=, $pop25, 5, $pop29
+; SIMD128-NEXT:    i16x8.extract_lane_u $push33=, $0, 6
+; SIMD128-NEXT:    i16x8.extract_lane_u $push31=, $1, 6
+; SIMD128-NEXT:    i32.const $push42=, 15
+; SIMD128-NEXT:    i32.and $push32=, $pop31, $pop42
+; SIMD128-NEXT:    i32.shr_u $push34=, $pop33, $pop32
+; SIMD128-NEXT:    i16x8.replace_lane $push35=, $pop30, 6, $pop34
+; SIMD128-NEXT:    i16x8.extract_lane_u $push38=, $0, 7
+; SIMD128-NEXT:    i16x8.extract_lane_u $push36=, $1, 7
+; SIMD128-NEXT:    i32.const $push41=, 15
+; SIMD128-NEXT:    i32.and $push37=, $pop36, $pop41
+; SIMD128-NEXT:    i32.shr_u $push39=, $pop38, $pop37
+; SIMD128-NEXT:    i16x8.replace_lane $push40=, $pop35, 7, $pop39
+; SIMD128-NEXT:    return $pop40
+;
+; SIMD128-FAST-LABEL: shr_u_vec_v8i16:
+; SIMD128-FAST:         .functype shr_u_vec_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push8=, $0, 0
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push6=, $1, 0
+; SIMD128-FAST-NEXT:    i32.const $push2=, 15
+; SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $pop2
+; SIMD128-FAST-NEXT:    i32.shr_u $push9=, $pop8, $pop7
+; SIMD128-FAST-NEXT:    i16x8.splat $push10=, $pop9
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push4=, $0, 1
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push1=, $1, 1
+; SIMD128-FAST-NEXT:    i32.const $push47=, 15
+; SIMD128-FAST-NEXT:    i32.and $push3=, $pop1, $pop47
+; SIMD128-FAST-NEXT:    i32.shr_u $push5=, $pop4, $pop3
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push11=, $pop10, 1, $pop5
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push14=, $0, 2
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push12=, $1, 2
+; SIMD128-FAST-NEXT:    i32.const $push46=, 15
+; SIMD128-FAST-NEXT:    i32.and $push13=, $pop12, $pop46
+; SIMD128-FAST-NEXT:    i32.shr_u $push15=, $pop14, $pop13
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push16=, $pop11, 2, $pop15
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push19=, $0, 3
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push17=, $1, 3
+; SIMD128-FAST-NEXT:    i32.const $push45=, 15
+; SIMD128-FAST-NEXT:    i32.and $push18=, $pop17, $pop45
+; SIMD128-FAST-NEXT:    i32.shr_u $push20=, $pop19, $pop18
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push21=, $pop16, 3, $pop20
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push24=, $0, 4
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push22=, $1, 4
+; SIMD128-FAST-NEXT:    i32.const $push44=, 15
+; SIMD128-FAST-NEXT:    i32.and $push23=, $pop22, $pop44
+; SIMD128-FAST-NEXT:    i32.shr_u $push25=, $pop24, $pop23
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push26=, $pop21, 4, $pop25
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push29=, $0, 5
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push27=, $1, 5
+; SIMD128-FAST-NEXT:    i32.const $push43=, 15
+; SIMD128-FAST-NEXT:    i32.and $push28=, $pop27, $pop43
+; SIMD128-FAST-NEXT:    i32.shr_u $push30=, $pop29, $pop28
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push31=, $pop26, 5, $pop30
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push34=, $0, 6
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push32=, $1, 6
+; SIMD128-FAST-NEXT:    i32.const $push42=, 15
+; SIMD128-FAST-NEXT:    i32.and $push33=, $pop32, $pop42
+; SIMD128-FAST-NEXT:    i32.shr_u $push35=, $pop34, $pop33
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push36=, $pop31, 6, $pop35
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push39=, $0, 7
+; SIMD128-FAST-NEXT:    i16x8.extract_lane_u $push37=, $1, 7
+; SIMD128-FAST-NEXT:    i32.const $push41=, 15
+; SIMD128-FAST-NEXT:    i32.and $push38=, $pop37, $pop41
+; SIMD128-FAST-NEXT:    i32.shr_u $push40=, $pop39, $pop38
+; SIMD128-FAST-NEXT:    i16x8.replace_lane $push0=, $pop36, 7, $pop40
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_u_vec_v8i16:
+; NO-SIMD128:         .functype shr_u_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-NEXT:    i32.and $push2=, $5, $pop0
+; NO-SIMD128-NEXT:    i32.const $push47=, 65535
+; NO-SIMD128-NEXT:    i32.and $push1=, $13, $pop47
+; NO-SIMD128-NEXT:    i32.shr_u $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push46=, 65535
+; NO-SIMD128-NEXT:    i32.and $push5=, $3, $pop46
+; NO-SIMD128-NEXT:    i32.const $push45=, 65535
+; NO-SIMD128-NEXT:    i32.and $push4=, $11, $pop45
+; NO-SIMD128-NEXT:    i32.shr_u $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop6
+; NO-SIMD128-NEXT:    i32.const $push44=, 65535
+; NO-SIMD128-NEXT:    i32.and $push8=, $2, $pop44
+; NO-SIMD128-NEXT:    i32.const $push43=, 65535
+; NO-SIMD128-NEXT:    i32.and $push7=, $10, $pop43
+; NO-SIMD128-NEXT:    i32.shr_u $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop9
+; NO-SIMD128-NEXT:    i32.const $push42=, 65535
+; NO-SIMD128-NEXT:    i32.and $push11=, $1, $pop42
+; NO-SIMD128-NEXT:    i32.const $push41=, 65535
+; NO-SIMD128-NEXT:    i32.and $push10=, $9, $pop41
+; NO-SIMD128-NEXT:    i32.shr_u $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop12
+; NO-SIMD128-NEXT:    i32.const $push16=, 14
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.const $push40=, 65535
+; NO-SIMD128-NEXT:    i32.and $push14=, $8, $pop40
+; NO-SIMD128-NEXT:    i32.const $push39=, 65535
+; NO-SIMD128-NEXT:    i32.and $push13=, $16, $pop39
+; NO-SIMD128-NEXT:    i32.shr_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT:    i32.store16 0($pop17), $pop15
+; NO-SIMD128-NEXT:    i32.const $push21=, 12
+; NO-SIMD128-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT:    i32.const $push38=, 65535
+; NO-SIMD128-NEXT:    i32.and $push19=, $7, $pop38
+; NO-SIMD128-NEXT:    i32.const $push37=, 65535
+; NO-SIMD128-NEXT:    i32.and $push18=, $15, $pop37
+; NO-SIMD128-NEXT:    i32.shr_u $push20=, $pop19, $pop18
+; NO-SIMD128-NEXT:    i32.store16 0($pop22), $pop20
+; NO-SIMD128-NEXT:    i32.const $push26=, 10
+; NO-SIMD128-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT:    i32.const $push36=, 65535
+; NO-SIMD128-NEXT:    i32.and $push24=, $6, $pop36
+; NO-SIMD128-NEXT:    i32.const $push35=, 65535
+; NO-SIMD128-NEXT:    i32.and $push23=, $14, $pop35
+; NO-SIMD128-NEXT:    i32.shr_u $push25=, $pop24, $pop23
+; NO-SIMD128-NEXT:    i32.store16 0($pop27), $pop25
+; NO-SIMD128-NEXT:    i32.const $push31=, 6
+; NO-SIMD128-NEXT:    i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT:    i32.const $push34=, 65535
+; NO-SIMD128-NEXT:    i32.and $push29=, $4, $pop34
+; NO-SIMD128-NEXT:    i32.const $push33=, 65535
+; NO-SIMD128-NEXT:    i32.and $push28=, $12, $pop33
+; NO-SIMD128-NEXT:    i32.shr_u $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT:    i32.store16 0($pop32), $pop30
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_u_vec_v8i16:
+; NO-SIMD128-FAST:         .functype shr_u_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $9, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push46=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $2, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.const $push45=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $10, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push44=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push8=, $3, $pop44
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $11, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push42=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $4, $pop42
+; NO-SIMD128-FAST-NEXT:    i32.const $push41=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $12, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop14), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push40=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push16=, $5, $pop40
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $13, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push17=, $pop16, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push38=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $6, $pop38
+; NO-SIMD128-FAST-NEXT:    i32.const $push37=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push18=, $14, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push20=, $pop19, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop22), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push36=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push24=, $7, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $15, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push25=, $pop24, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop27), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push31=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push34=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $8, $pop34
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push28=, $16, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push30=, $pop29, $pop28
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT:    return
   %a = lshr <8 x i16> %v, %x
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: and_v8i16:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype and_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @and_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SIMD128-LABEL: and_v8i16:
+; SIMD128:         .functype and_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.and $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: and_v8i16:
+; SIMD128-FAST:         .functype and_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.and $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: and_v8i16:
+; NO-SIMD128:         .functype and_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.and $push0=, $5, $13
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.and $push1=, $3, $11
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.and $push2=, $2, $10
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop2
+; NO-SIMD128-NEXT:    i32.and $push3=, $1, $9
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 14
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.and $push4=, $8, $16
+; NO-SIMD128-NEXT:    i32.store16 0($pop6), $pop4
+; NO-SIMD128-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.and $push7=, $7, $15
+; NO-SIMD128-NEXT:    i32.store16 0($pop9), $pop7
+; NO-SIMD128-NEXT:    i32.const $push11=, 10
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.and $push10=, $6, $14
+; NO-SIMD128-NEXT:    i32.store16 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push14=, 6
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.and $push13=, $4, $12
+; NO-SIMD128-NEXT:    i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: and_v8i16:
+; NO-SIMD128-FAST:         .functype and_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $1, $9
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $2, $10
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $3, $11
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $4, $12
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.and $push6=, $5, $13
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $6, $14
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop8), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push10=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.and $push12=, $7, $15
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop11), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $8, $16
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT:    return
   %a = and <8 x i16> %x, %y
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: or_v8i16:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype or_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @or_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SIMD128-LABEL: or_v8i16:
+; SIMD128:         .functype or_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.or $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: or_v8i16:
+; SIMD128-FAST:         .functype or_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.or $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: or_v8i16:
+; NO-SIMD128:         .functype or_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.or $push0=, $5, $13
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.or $push1=, $3, $11
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.or $push2=, $2, $10
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop2
+; NO-SIMD128-NEXT:    i32.or $push3=, $1, $9
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 14
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.or $push4=, $8, $16
+; NO-SIMD128-NEXT:    i32.store16 0($pop6), $pop4
+; NO-SIMD128-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.or $push7=, $7, $15
+; NO-SIMD128-NEXT:    i32.store16 0($pop9), $pop7
+; NO-SIMD128-NEXT:    i32.const $push11=, 10
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.or $push10=, $6, $14
+; NO-SIMD128-NEXT:    i32.store16 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push14=, 6
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.or $push13=, $4, $12
+; NO-SIMD128-NEXT:    i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: or_v8i16:
+; NO-SIMD128-FAST:         .functype or_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.or $push0=, $1, $9
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.or $push1=, $2, $10
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.or $push2=, $3, $11
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.or $push5=, $4, $12
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.or $push6=, $5, $13
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.or $push9=, $6, $14
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop8), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push10=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $7, $15
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop11), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.or $push15=, $8, $16
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT:    return
   %a = or <8 x i16> %x, %y
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: xor_v8i16:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype xor_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.xor $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @xor_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SIMD128-LABEL: xor_v8i16:
+; SIMD128:         .functype xor_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.xor $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: xor_v8i16:
+; SIMD128-FAST:         .functype xor_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.xor $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: xor_v8i16:
+; NO-SIMD128:         .functype xor_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.xor $push0=, $5, $13
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.xor $push1=, $3, $11
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.xor $push2=, $2, $10
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop2
+; NO-SIMD128-NEXT:    i32.xor $push3=, $1, $9
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 14
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.xor $push4=, $8, $16
+; NO-SIMD128-NEXT:    i32.store16 0($pop6), $pop4
+; NO-SIMD128-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.xor $push7=, $7, $15
+; NO-SIMD128-NEXT:    i32.store16 0($pop9), $pop7
+; NO-SIMD128-NEXT:    i32.const $push11=, 10
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.xor $push10=, $6, $14
+; NO-SIMD128-NEXT:    i32.store16 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push14=, 6
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.xor $push13=, $4, $12
+; NO-SIMD128-NEXT:    i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: xor_v8i16:
+; NO-SIMD128-FAST:         .functype xor_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $1, $9
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.xor $push1=, $2, $10
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $3, $11
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $4, $12
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $5, $13
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $6, $14
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop8), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push10=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.xor $push12=, $7, $15
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop11), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.xor $push15=, $8, $16
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT:    return
   %a = xor <8 x i16> %x, %y
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: not_v8i16:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype not_v8i16 (v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @not_v8i16(<8 x i16> %x) {
+; SIMD128-LABEL: not_v8i16:
+; SIMD128:         .functype not_v8i16 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.not $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: not_v8i16:
+; SIMD128-FAST:         .functype not_v8i16 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.not $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: not_v8i16:
+; NO-SIMD128:         .functype not_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-NEXT:    i32.xor $push1=, $5, $pop0
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop1
+; NO-SIMD128-NEXT:    i32.const $push23=, -1
+; NO-SIMD128-NEXT:    i32.xor $push2=, $3, $pop23
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push22=, -1
+; NO-SIMD128-NEXT:    i32.xor $push3=, $2, $pop22
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push21=, -1
+; NO-SIMD128-NEXT:    i32.xor $push4=, $1, $pop21
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push6=, 14
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.const $push20=, -1
+; NO-SIMD128-NEXT:    i32.xor $push5=, $8, $pop20
+; NO-SIMD128-NEXT:    i32.store16 0($pop7), $pop5
+; NO-SIMD128-NEXT:    i32.const $push9=, 12
+; NO-SIMD128-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT:    i32.const $push19=, -1
+; NO-SIMD128-NEXT:    i32.xor $push8=, $7, $pop19
+; NO-SIMD128-NEXT:    i32.store16 0($pop10), $pop8
+; NO-SIMD128-NEXT:    i32.const $push12=, 10
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.const $push18=, -1
+; NO-SIMD128-NEXT:    i32.xor $push11=, $6, $pop18
+; NO-SIMD128-NEXT:    i32.store16 0($pop13), $pop11
+; NO-SIMD128-NEXT:    i32.const $push15=, 6
+; NO-SIMD128-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-NEXT:    i32.const $push17=, -1
+; NO-SIMD128-NEXT:    i32.xor $push14=, $4, $pop17
+; NO-SIMD128-NEXT:    i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: not_v8i16:
+; NO-SIMD128-FAST:         .functype not_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.const $push23=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $2, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push22=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $3, $pop22
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $4, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push20=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push7=, $5, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop7
+; NO-SIMD128-FAST-NEXT:    i32.const $push8=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $6, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop9), $pop10
+; NO-SIMD128-FAST-NEXT:    i32.const $push11=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.const $push18=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push13=, $7, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop12), $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push14=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push16=, $8, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT:    return
   %a = xor <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1,
                           i16 -1, i16 -1, i16 -1, i16 -1>
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: andnot_v8i16:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype andnot_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: v128.andnot $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
-; SIMD128-FAST-NEXT: v128.not
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: return
 define <8 x i16> @andnot_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SIMD128-LABEL: andnot_v8i16:
+; SIMD128:         .functype andnot_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.andnot $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: andnot_v8i16:
+; SIMD128-FAST:         .functype andnot_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.not $push0=, $1
+; SIMD128-FAST-NEXT:    v128.and $push1=, $0, $pop0
+; SIMD128-FAST-NEXT:    return $pop1
+;
+; NO-SIMD128-LABEL: andnot_v8i16:
+; NO-SIMD128:         .functype andnot_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-NEXT:    i32.xor $push1=, $13, $pop0
+; NO-SIMD128-NEXT:    i32.and $push2=, $5, $pop1
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push31=, -1
+; NO-SIMD128-NEXT:    i32.xor $push3=, $11, $pop31
+; NO-SIMD128-NEXT:    i32.and $push4=, $3, $pop3
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push30=, -1
+; NO-SIMD128-NEXT:    i32.xor $push5=, $10, $pop30
+; NO-SIMD128-NEXT:    i32.and $push6=, $2, $pop5
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT:    i32.const $push29=, -1
+; NO-SIMD128-NEXT:    i32.xor $push7=, $9, $pop29
+; NO-SIMD128-NEXT:    i32.and $push8=, $1, $pop7
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop8
+; NO-SIMD128-NEXT:    i32.const $push11=, 14
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.const $push28=, -1
+; NO-SIMD128-NEXT:    i32.xor $push9=, $16, $pop28
+; NO-SIMD128-NEXT:    i32.and $push10=, $8, $pop9
+; NO-SIMD128-NEXT:    i32.store16 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push15=, 12
+; NO-SIMD128-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-NEXT:    i32.const $push27=, -1
+; NO-SIMD128-NEXT:    i32.xor $push13=, $15, $pop27
+; NO-SIMD128-NEXT:    i32.and $push14=, $7, $pop13
+; NO-SIMD128-NEXT:    i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT:    i32.const $push19=, 10
+; NO-SIMD128-NEXT:    i32.add $push20=, $0, $pop19
+; NO-SIMD128-NEXT:    i32.const $push26=, -1
+; NO-SIMD128-NEXT:    i32.xor $push17=, $14, $pop26
+; NO-SIMD128-NEXT:    i32.and $push18=, $6, $pop17
+; NO-SIMD128-NEXT:    i32.store16 0($pop20), $pop18
+; NO-SIMD128-NEXT:    i32.const $push23=, 6
+; NO-SIMD128-NEXT:    i32.add $push24=, $0, $pop23
+; NO-SIMD128-NEXT:    i32.const $push25=, -1
+; NO-SIMD128-NEXT:    i32.xor $push21=, $12, $pop25
+; NO-SIMD128-NEXT:    i32.and $push22=, $4, $pop21
+; NO-SIMD128-NEXT:    i32.store16 0($pop24), $pop22
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: andnot_v8i16:
+; NO-SIMD128-FAST:         .functype andnot_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push1=, $9, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push31=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $10, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $2, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push30=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $11, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.and $push6=, $3, $pop5
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.const $push29=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $12, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $4, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop8), $pop10
+; NO-SIMD128-FAST-NEXT:    i32.const $push28=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $13, $pop28
+; NO-SIMD128-FAST-NEXT:    i32.and $push12=, $5, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push15=, $14, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.and $push16=, $6, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop14), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push19=, $15, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.and $push20=, $7, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop18), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push25=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push23=, $16, $pop25
+; NO-SIMD128-FAST-NEXT:    i32.and $push24=, $8, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop22), $pop24
+; NO-SIMD128-FAST-NEXT:    return
  %inv_y = xor <8 x i16> %y,
    <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
  %a = and <8 x i16> %x, %inv_y
  ret <8 x i16> %a
 }
 
-; CHECK-LABEL: bitselect_v8i16:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype bitselect_v8i16 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: v128.not
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: v128.or
-; SIMD128-FAST-NEXT: return
 define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
+; SIMD128-LABEL: bitselect_v8i16:
+; SIMD128:         .functype bitselect_v8i16 (v128, v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.bitselect $push0=, $1, $2, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: bitselect_v8i16:
+; SIMD128-FAST:         .functype bitselect_v8i16 (v128, v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.and $push0=, $1, $0
+; SIMD128-FAST-NEXT:    v128.not $push2=, $0
+; SIMD128-FAST-NEXT:    v128.and $push3=, $2, $pop2
+; SIMD128-FAST-NEXT:    v128.or $push1=, $pop0, $pop3
+; SIMD128-FAST-NEXT:    return $pop1
+;
+; NO-SIMD128-LABEL: bitselect_v8i16:
+; NO-SIMD128:         .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push5=, 14
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.and $push0=, $16, $8
+; NO-SIMD128-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-NEXT:    i32.xor $push2=, $8, $pop1
+; NO-SIMD128-NEXT:    i32.and $push3=, $24, $pop2
+; NO-SIMD128-NEXT:    i32.or $push4=, $pop0, $pop3
+; NO-SIMD128-NEXT:    i32.store16 0($pop6), $pop4
+; NO-SIMD128-NEXT:    i32.const $push11=, 12
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.and $push7=, $15, $7
+; NO-SIMD128-NEXT:    i32.const $push47=, -1
+; NO-SIMD128-NEXT:    i32.xor $push8=, $7, $pop47
+; NO-SIMD128-NEXT:    i32.and $push9=, $23, $pop8
+; NO-SIMD128-NEXT:    i32.or $push10=, $pop7, $pop9
+; NO-SIMD128-NEXT:    i32.store16 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push17=, 10
+; NO-SIMD128-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-NEXT:    i32.and $push13=, $14, $6
+; NO-SIMD128-NEXT:    i32.const $push46=, -1
+; NO-SIMD128-NEXT:    i32.xor $push14=, $6, $pop46
+; NO-SIMD128-NEXT:    i32.and $push15=, $22, $pop14
+; NO-SIMD128-NEXT:    i32.or $push16=, $pop13, $pop15
+; NO-SIMD128-NEXT:    i32.store16 0($pop18), $pop16
+; NO-SIMD128-NEXT:    i32.and $push19=, $13, $5
+; NO-SIMD128-NEXT:    i32.const $push45=, -1
+; NO-SIMD128-NEXT:    i32.xor $push20=, $5, $pop45
+; NO-SIMD128-NEXT:    i32.and $push21=, $21, $pop20
+; NO-SIMD128-NEXT:    i32.or $push22=, $pop19, $pop21
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop22
+; NO-SIMD128-NEXT:    i32.const $push27=, 6
+; NO-SIMD128-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-NEXT:    i32.and $push23=, $12, $4
+; NO-SIMD128-NEXT:    i32.const $push44=, -1
+; NO-SIMD128-NEXT:    i32.xor $push24=, $4, $pop44
+; NO-SIMD128-NEXT:    i32.and $push25=, $20, $pop24
+; NO-SIMD128-NEXT:    i32.or $push26=, $pop23, $pop25
+; NO-SIMD128-NEXT:    i32.store16 0($pop28), $pop26
+; NO-SIMD128-NEXT:    i32.and $push29=, $11, $3
+; NO-SIMD128-NEXT:    i32.const $push43=, -1
+; NO-SIMD128-NEXT:    i32.xor $push30=, $3, $pop43
+; NO-SIMD128-NEXT:    i32.and $push31=, $19, $pop30
+; NO-SIMD128-NEXT:    i32.or $push32=, $pop29, $pop31
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop32
+; NO-SIMD128-NEXT:    i32.and $push33=, $10, $2
+; NO-SIMD128-NEXT:    i32.const $push42=, -1
+; NO-SIMD128-NEXT:    i32.xor $push34=, $2, $pop42
+; NO-SIMD128-NEXT:    i32.and $push35=, $18, $pop34
+; NO-SIMD128-NEXT:    i32.or $push36=, $pop33, $pop35
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop36
+; NO-SIMD128-NEXT:    i32.and $push37=, $9, $1
+; NO-SIMD128-NEXT:    i32.const $push41=, -1
+; NO-SIMD128-NEXT:    i32.xor $push38=, $1, $pop41
+; NO-SIMD128-NEXT:    i32.and $push39=, $17, $pop38
+; NO-SIMD128-NEXT:    i32.or $push40=, $pop37, $pop39
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop40
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: bitselect_v8i16:
+; NO-SIMD128-FAST:         .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $9, $1
+; NO-SIMD128-FAST-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $17, $pop2
+; NO-SIMD128-FAST-NEXT:    i32.or $push4=, $pop0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $10, $2
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $2, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $18, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.or $push8=, $pop5, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $11, $3
+; NO-SIMD128-FAST-NEXT:    i32.const $push46=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $3, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $19, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $pop9, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $12, $4
+; NO-SIMD128-FAST-NEXT:    i32.const $push45=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $4, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $20, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.or $push16=, $pop13, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop18), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $13, $5
+; NO-SIMD128-FAST-NEXT:    i32.const $push44=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push20=, $5, $pop44
+; NO-SIMD128-FAST-NEXT:    i32.and $push21=, $21, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.or $push22=, $pop19, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $14, $6
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push24=, $6, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $22, $pop24
+; NO-SIMD128-FAST-NEXT:    i32.or $push26=, $pop23, $pop25
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop28), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $15, $7
+; NO-SIMD128-FAST-NEXT:    i32.const $push42=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push30=, $7, $pop42
+; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $23, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.or $push32=, $pop29, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop34), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push40=, $0, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.and $push35=, $16, $8
+; NO-SIMD128-FAST-NEXT:    i32.const $push41=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push36=, $8, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $24, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.or $push38=, $pop35, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <8 x i16> %v1, %c
   %inv_mask = xor <8 x i16>
     <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>,
@@ -692,31 +9185,253 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: bitselect_xor_v8i16:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype bitselect_xor_v8i16 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
-; SIMD128-FAST-NEXT: v128.xor
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: v128.xor
 define <8 x i16> @bitselect_xor_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
+; SIMD128-LABEL: bitselect_xor_v8i16:
+; SIMD128:         .functype bitselect_xor_v8i16 (v128, v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.bitselect $push0=, $1, $2, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: bitselect_xor_v8i16:
+; SIMD128-FAST:         .functype bitselect_xor_v8i16 (v128, v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.xor $push2=, $1, $2
+; SIMD128-FAST-NEXT:    v128.and $push1=, $pop2, $0
+; SIMD128-FAST-NEXT:    v128.xor $push0=, $pop1, $2
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: bitselect_xor_v8i16:
+; NO-SIMD128:         .functype bitselect_xor_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push3=, 14
+; NO-SIMD128-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-NEXT:    i32.xor $push0=, $16, $24
+; NO-SIMD128-NEXT:    i32.and $push1=, $pop0, $8
+; NO-SIMD128-NEXT:    i32.xor $push2=, $pop1, $24
+; NO-SIMD128-NEXT:    i32.store16 0($pop4), $pop2
+; NO-SIMD128-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.xor $push5=, $15, $23
+; NO-SIMD128-NEXT:    i32.and $push6=, $pop5, $7
+; NO-SIMD128-NEXT:    i32.xor $push7=, $pop6, $23
+; NO-SIMD128-NEXT:    i32.store16 0($pop9), $pop7
+; NO-SIMD128-NEXT:    i32.const $push13=, 10
+; NO-SIMD128-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT:    i32.xor $push10=, $14, $22
+; NO-SIMD128-NEXT:    i32.and $push11=, $pop10, $6
+; NO-SIMD128-NEXT:    i32.xor $push12=, $pop11, $22
+; NO-SIMD128-NEXT:    i32.store16 0($pop14), $pop12
+; NO-SIMD128-NEXT:    i32.xor $push15=, $13, $21
+; NO-SIMD128-NEXT:    i32.and $push16=, $pop15, $5
+; NO-SIMD128-NEXT:    i32.xor $push17=, $pop16, $21
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop17
+; NO-SIMD128-NEXT:    i32.const $push21=, 6
+; NO-SIMD128-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT:    i32.xor $push18=, $12, $20
+; NO-SIMD128-NEXT:    i32.and $push19=, $pop18, $4
+; NO-SIMD128-NEXT:    i32.xor $push20=, $pop19, $20
+; NO-SIMD128-NEXT:    i32.store16 0($pop22), $pop20
+; NO-SIMD128-NEXT:    i32.xor $push23=, $11, $19
+; NO-SIMD128-NEXT:    i32.and $push24=, $pop23, $3
+; NO-SIMD128-NEXT:    i32.xor $push25=, $pop24, $19
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop25
+; NO-SIMD128-NEXT:    i32.xor $push26=, $10, $18
+; NO-SIMD128-NEXT:    i32.and $push27=, $pop26, $2
+; NO-SIMD128-NEXT:    i32.xor $push28=, $pop27, $18
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop28
+; NO-SIMD128-NEXT:    i32.xor $push29=, $9, $17
+; NO-SIMD128-NEXT:    i32.and $push30=, $pop29, $1
+; NO-SIMD128-NEXT:    i32.xor $push31=, $pop30, $17
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop31
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: bitselect_xor_v8i16:
+; NO-SIMD128-FAST:         .functype bitselect_xor_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $9, $17
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $pop1, $17
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $10, $18
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop3, $2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $pop4, $18
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $11, $19
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $3
+; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $19
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push9=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $12, $20
+; NO-SIMD128-FAST-NEXT:    i32.and $push12=, $pop11, $4
+; NO-SIMD128-FAST-NEXT:    i32.xor $push13=, $pop12, $20
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop10), $pop13
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $13, $21
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $pop14, $5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push16=, $pop15, $21
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.xor $push19=, $14, $22
+; NO-SIMD128-FAST-NEXT:    i32.and $push20=, $pop19, $6
+; NO-SIMD128-FAST-NEXT:    i32.xor $push21=, $pop20, $22
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop18), $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push22=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push23=, $0, $pop22
+; NO-SIMD128-FAST-NEXT:    i32.xor $push24=, $15, $23
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $pop24, $7
+; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $pop25, $23
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop23), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.xor $push29=, $16, $24
+; NO-SIMD128-FAST-NEXT:    i32.and $push30=, $pop29, $8
+; NO-SIMD128-FAST-NEXT:    i32.xor $push31=, $pop30, $24
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop28), $pop31
+; NO-SIMD128-FAST-NEXT:    return
  %xor1 = xor <8 x i16> %v1, %v2
  %and = and <8 x i16> %xor1, %c
  %a = xor <8 x i16> %and, %v2
  ret <8 x i16> %a
 }
 
-; CHECK-LABEL: bitselect_xor_reversed_v8i16:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype bitselect_xor_reversed_v8i16 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $2, $1, $0{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
-; SIMD128-FAST-NEXT: v128.xor
-; SIMD128-FAST-NEXT: v128.not
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: v128.xor
 define <8 x i16> @bitselect_xor_reversed_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
+; SIMD128-LABEL: bitselect_xor_reversed_v8i16:
+; SIMD128:         .functype bitselect_xor_reversed_v8i16 (v128, v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.bitselect $push0=, $2, $1, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: bitselect_xor_reversed_v8i16:
+; SIMD128-FAST:         .functype bitselect_xor_reversed_v8i16 (v128, v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.xor $push1=, $1, $2
+; SIMD128-FAST-NEXT:    v128.not $push2=, $0
+; SIMD128-FAST-NEXT:    v128.and $push3=, $pop1, $pop2
+; SIMD128-FAST-NEXT:    v128.xor $push0=, $pop3, $2
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: bitselect_xor_reversed_v8i16:
+; NO-SIMD128:         .functype bitselect_xor_reversed_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push5=, 14
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.xor $push2=, $16, $24
+; NO-SIMD128-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-NEXT:    i32.xor $push1=, $8, $pop0
+; NO-SIMD128-NEXT:    i32.and $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.xor $push4=, $pop3, $24
+; NO-SIMD128-NEXT:    i32.store16 0($pop6), $pop4
+; NO-SIMD128-NEXT:    i32.const $push11=, 12
+; NO-SIMD128-NEXT:    i32.add $push12=, $0, $pop11
+; NO-SIMD128-NEXT:    i32.xor $push8=, $15, $23
+; NO-SIMD128-NEXT:    i32.const $push47=, -1
+; NO-SIMD128-NEXT:    i32.xor $push7=, $7, $pop47
+; NO-SIMD128-NEXT:    i32.and $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.xor $push10=, $pop9, $23
+; NO-SIMD128-NEXT:    i32.store16 0($pop12), $pop10
+; NO-SIMD128-NEXT:    i32.const $push17=, 10
+; NO-SIMD128-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-NEXT:    i32.xor $push14=, $14, $22
+; NO-SIMD128-NEXT:    i32.const $push46=, -1
+; NO-SIMD128-NEXT:    i32.xor $push13=, $6, $pop46
+; NO-SIMD128-NEXT:    i32.and $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT:    i32.xor $push16=, $pop15, $22
+; NO-SIMD128-NEXT:    i32.store16 0($pop18), $pop16
+; NO-SIMD128-NEXT:    i32.xor $push20=, $13, $21
+; NO-SIMD128-NEXT:    i32.const $push45=, -1
+; NO-SIMD128-NEXT:    i32.xor $push19=, $5, $pop45
+; NO-SIMD128-NEXT:    i32.and $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT:    i32.xor $push22=, $pop21, $21
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop22
+; NO-SIMD128-NEXT:    i32.const $push27=, 6
+; NO-SIMD128-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-NEXT:    i32.xor $push24=, $12, $20
+; NO-SIMD128-NEXT:    i32.const $push44=, -1
+; NO-SIMD128-NEXT:    i32.xor $push23=, $4, $pop44
+; NO-SIMD128-NEXT:    i32.and $push25=, $pop24, $pop23
+; NO-SIMD128-NEXT:    i32.xor $push26=, $pop25, $20
+; NO-SIMD128-NEXT:    i32.store16 0($pop28), $pop26
+; NO-SIMD128-NEXT:    i32.xor $push30=, $11, $19
+; NO-SIMD128-NEXT:    i32.const $push43=, -1
+; NO-SIMD128-NEXT:    i32.xor $push29=, $3, $pop43
+; NO-SIMD128-NEXT:    i32.and $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT:    i32.xor $push32=, $pop31, $19
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop32
+; NO-SIMD128-NEXT:    i32.xor $push34=, $10, $18
+; NO-SIMD128-NEXT:    i32.const $push42=, -1
+; NO-SIMD128-NEXT:    i32.xor $push33=, $2, $pop42
+; NO-SIMD128-NEXT:    i32.and $push35=, $pop34, $pop33
+; NO-SIMD128-NEXT:    i32.xor $push36=, $pop35, $18
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop36
+; NO-SIMD128-NEXT:    i32.xor $push38=, $9, $17
+; NO-SIMD128-NEXT:    i32.const $push41=, -1
+; NO-SIMD128-NEXT:    i32.xor $push37=, $1, $pop41
+; NO-SIMD128-NEXT:    i32.and $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT:    i32.xor $push40=, $pop39, $17
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop40
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: bitselect_xor_reversed_v8i16:
+; NO-SIMD128-FAST:         .functype bitselect_xor_reversed_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $9, $17
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push4=, $pop3, $17
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $10, $18
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $2, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $pop5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $18
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $11, $19
+; NO-SIMD128-FAST-NEXT:    i32.const $push46=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $3, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.xor $push12=, $pop11, $19
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $12, $20
+; NO-SIMD128-FAST-NEXT:    i32.const $push45=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push13=, $4, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.xor $push16=, $pop15, $20
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop18), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.xor $push20=, $13, $21
+; NO-SIMD128-FAST-NEXT:    i32.const $push44=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push19=, $5, $pop44
+; NO-SIMD128-FAST-NEXT:    i32.and $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.xor $push22=, $pop21, $21
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop22
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.xor $push24=, $14, $22
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push23=, $6, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $pop24, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $pop25, $22
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop28), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push34=, $0, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.xor $push30=, $15, $23
+; NO-SIMD128-FAST-NEXT:    i32.const $push42=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push29=, $7, $pop42
+; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $pop30, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.xor $push32=, $pop31, $23
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop34), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push40=, $0, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.xor $push36=, $16, $24
+; NO-SIMD128-FAST-NEXT:    i32.const $push41=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push35=, $8, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $pop36, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.xor $push38=, $pop37, $24
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT:    return
  %xor1 = xor <8 x i16> %v1, %v2
  %notc = xor <8 x i16> %c, <i16 -1, i16 -1, i16 -1, i16 -1,
                             i16 -1, i16 -1, i16 -1, i16 -1>
@@ -725,12 +9440,110 @@ define <8 x i16> @bitselect_xor_reversed_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x
  ret <8 x i16> %a
 }
 
-; CHECK-LABEL: extmul_low_s_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype extmul_low_s_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: i16x8.extmul_low_i8x16_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @extmul_low_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
+; SIMD128-LABEL: extmul_low_s_v8i16:
+; SIMD128:         .functype extmul_low_s_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extmul_low_i8x16_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: extmul_low_s_v8i16:
+; SIMD128-FAST:         .functype extmul_low_s_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.extend_low_i8x16_s $push0=, $0
+; SIMD128-FAST-NEXT:    i16x8.extend_low_i8x16_s $push1=, $1
+; SIMD128-FAST-NEXT:    i16x8.mul $push2=, $pop0, $pop1
+; SIMD128-FAST-NEXT:    return $pop2
+;
+; NO-SIMD128-LABEL: extmul_low_s_v8i16:
+; NO-SIMD128:         .functype extmul_low_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.extend8_s $push1=, $5
+; NO-SIMD128-NEXT:    i32.extend8_s $push0=, $21
+; NO-SIMD128-NEXT:    i32.mul $push2=, $pop1, $pop0
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop2
+; NO-SIMD128-NEXT:    i32.extend8_s $push4=, $3
+; NO-SIMD128-NEXT:    i32.extend8_s $push3=, $19
+; NO-SIMD128-NEXT:    i32.mul $push5=, $pop4, $pop3
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT:    i32.extend8_s $push7=, $2
+; NO-SIMD128-NEXT:    i32.extend8_s $push6=, $18
+; NO-SIMD128-NEXT:    i32.mul $push8=, $pop7, $pop6
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop8
+; NO-SIMD128-NEXT:    i32.extend8_s $push10=, $1
+; NO-SIMD128-NEXT:    i32.extend8_s $push9=, $17
+; NO-SIMD128-NEXT:    i32.mul $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop11
+; NO-SIMD128-NEXT:    i32.const $push15=, 14
+; NO-SIMD128-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-NEXT:    i32.extend8_s $push13=, $8
+; NO-SIMD128-NEXT:    i32.extend8_s $push12=, $24
+; NO-SIMD128-NEXT:    i32.mul $push14=, $pop13, $pop12
+; NO-SIMD128-NEXT:    i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT:    i32.const $push20=, 12
+; NO-SIMD128-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-NEXT:    i32.extend8_s $push18=, $7
+; NO-SIMD128-NEXT:    i32.extend8_s $push17=, $23
+; NO-SIMD128-NEXT:    i32.mul $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT:    i32.store16 0($pop21), $pop19
+; NO-SIMD128-NEXT:    i32.const $push25=, 10
+; NO-SIMD128-NEXT:    i32.add $push26=, $0, $pop25
+; NO-SIMD128-NEXT:    i32.extend8_s $push23=, $6
+; NO-SIMD128-NEXT:    i32.extend8_s $push22=, $22
+; NO-SIMD128-NEXT:    i32.mul $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT:    i32.store16 0($pop26), $pop24
+; NO-SIMD128-NEXT:    i32.const $push30=, 6
+; NO-SIMD128-NEXT:    i32.add $push31=, $0, $pop30
+; NO-SIMD128-NEXT:    i32.extend8_s $push28=, $4
+; NO-SIMD128-NEXT:    i32.extend8_s $push27=, $20
+; NO-SIMD128-NEXT:    i32.mul $push29=, $pop28, $pop27
+; NO-SIMD128-NEXT:    i32.store16 0($pop31), $pop29
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: extmul_low_s_v8i16:
+; NO-SIMD128-FAST:         .functype extmul_low_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push1=, $1
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push0=, $17
+; NO-SIMD128-FAST-NEXT:    i32.mul $push2=, $pop1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push4=, $2
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push3=, $18
+; NO-SIMD128-FAST-NEXT:    i32.mul $push5=, $pop4, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push7=, $3
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push6=, $19
+; NO-SIMD128-FAST-NEXT:    i32.mul $push8=, $pop7, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push9=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push12=, $4
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push11=, $20
+; NO-SIMD128-FAST-NEXT:    i32.mul $push13=, $pop12, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop10), $pop13
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push15=, $5
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push14=, $21
+; NO-SIMD128-FAST-NEXT:    i32.mul $push16=, $pop15, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push20=, $6
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push19=, $22
+; NO-SIMD128-FAST-NEXT:    i32.mul $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop18), $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push22=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push23=, $0, $pop22
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push25=, $7
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push24=, $23
+; NO-SIMD128-FAST-NEXT:    i32.mul $push26=, $pop25, $pop24
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop23), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push30=, $8
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push29=, $24
+; NO-SIMD128-FAST-NEXT:    i32.mul $push31=, $pop30, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop28), $pop31
+; NO-SIMD128-FAST-NEXT:    return
   %low1 = shufflevector <16 x i8> %v1, <16 x i8> undef,
            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %low2 = shufflevector <16 x i8> %v2, <16 x i8> undef,
@@ -741,12 +9554,110 @@ define <8 x i16> @extmul_low_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: extmul_high_s_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype extmul_high_s_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: i16x8.extmul_high_i8x16_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @extmul_high_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
+; SIMD128-LABEL: extmul_high_s_v8i16:
+; SIMD128:         .functype extmul_high_s_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extmul_high_i8x16_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: extmul_high_s_v8i16:
+; SIMD128-FAST:         .functype extmul_high_s_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.extend_high_i8x16_s $push0=, $0
+; SIMD128-FAST-NEXT:    i16x8.extend_high_i8x16_s $push1=, $1
+; SIMD128-FAST-NEXT:    i16x8.mul $push2=, $pop0, $pop1
+; SIMD128-FAST-NEXT:    return $pop2
+;
+; NO-SIMD128-LABEL: extmul_high_s_v8i16:
+; NO-SIMD128:         .functype extmul_high_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.extend8_s $push1=, $13
+; NO-SIMD128-NEXT:    i32.extend8_s $push0=, $29
+; NO-SIMD128-NEXT:    i32.mul $push2=, $pop1, $pop0
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop2
+; NO-SIMD128-NEXT:    i32.extend8_s $push4=, $11
+; NO-SIMD128-NEXT:    i32.extend8_s $push3=, $27
+; NO-SIMD128-NEXT:    i32.mul $push5=, $pop4, $pop3
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT:    i32.extend8_s $push7=, $10
+; NO-SIMD128-NEXT:    i32.extend8_s $push6=, $26
+; NO-SIMD128-NEXT:    i32.mul $push8=, $pop7, $pop6
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop8
+; NO-SIMD128-NEXT:    i32.extend8_s $push10=, $9
+; NO-SIMD128-NEXT:    i32.extend8_s $push9=, $25
+; NO-SIMD128-NEXT:    i32.mul $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop11
+; NO-SIMD128-NEXT:    i32.const $push15=, 14
+; NO-SIMD128-NEXT:    i32.add $push16=, $0, $pop15
+; NO-SIMD128-NEXT:    i32.extend8_s $push13=, $16
+; NO-SIMD128-NEXT:    i32.extend8_s $push12=, $32
+; NO-SIMD128-NEXT:    i32.mul $push14=, $pop13, $pop12
+; NO-SIMD128-NEXT:    i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT:    i32.const $push20=, 12
+; NO-SIMD128-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-NEXT:    i32.extend8_s $push18=, $15
+; NO-SIMD128-NEXT:    i32.extend8_s $push17=, $31
+; NO-SIMD128-NEXT:    i32.mul $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT:    i32.store16 0($pop21), $pop19
+; NO-SIMD128-NEXT:    i32.const $push25=, 10
+; NO-SIMD128-NEXT:    i32.add $push26=, $0, $pop25
+; NO-SIMD128-NEXT:    i32.extend8_s $push23=, $14
+; NO-SIMD128-NEXT:    i32.extend8_s $push22=, $30
+; NO-SIMD128-NEXT:    i32.mul $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT:    i32.store16 0($pop26), $pop24
+; NO-SIMD128-NEXT:    i32.const $push30=, 6
+; NO-SIMD128-NEXT:    i32.add $push31=, $0, $pop30
+; NO-SIMD128-NEXT:    i32.extend8_s $push28=, $12
+; NO-SIMD128-NEXT:    i32.extend8_s $push27=, $28
+; NO-SIMD128-NEXT:    i32.mul $push29=, $pop28, $pop27
+; NO-SIMD128-NEXT:    i32.store16 0($pop31), $pop29
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: extmul_high_s_v8i16:
+; NO-SIMD128-FAST:         .functype extmul_high_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push1=, $9
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push0=, $25
+; NO-SIMD128-FAST-NEXT:    i32.mul $push2=, $pop1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push4=, $10
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push3=, $26
+; NO-SIMD128-FAST-NEXT:    i32.mul $push5=, $pop4, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push7=, $11
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push6=, $27
+; NO-SIMD128-FAST-NEXT:    i32.mul $push8=, $pop7, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push9=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push12=, $12
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push11=, $28
+; NO-SIMD128-FAST-NEXT:    i32.mul $push13=, $pop12, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop10), $pop13
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push15=, $13
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push14=, $29
+; NO-SIMD128-FAST-NEXT:    i32.mul $push16=, $pop15, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push20=, $14
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push19=, $30
+; NO-SIMD128-FAST-NEXT:    i32.mul $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop18), $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push22=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push23=, $0, $pop22
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push25=, $15
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push24=, $31
+; NO-SIMD128-FAST-NEXT:    i32.mul $push26=, $pop25, $pop24
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop23), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push27=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push28=, $0, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push30=, $16
+; NO-SIMD128-FAST-NEXT:    i32.extend8_s $push29=, $32
+; NO-SIMD128-FAST-NEXT:    i32.mul $push31=, $pop30, $pop29
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop28), $pop31
+; NO-SIMD128-FAST-NEXT:    return
   %high1 = shufflevector <16 x i8> %v1, <16 x i8> undef,
            <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %high2 = shufflevector <16 x i8> %v2, <16 x i8> undef,
@@ -757,12 +9668,142 @@ define <8 x i16> @extmul_high_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: extmul_low_u_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype extmul_low_u_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: i16x8.extmul_low_i8x16_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @extmul_low_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
+; SIMD128-LABEL: extmul_low_u_v8i16:
+; SIMD128:         .functype extmul_low_u_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extmul_low_i8x16_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: extmul_low_u_v8i16:
+; SIMD128-FAST:         .functype extmul_low_u_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.extend_low_i8x16_u $push0=, $0
+; SIMD128-FAST-NEXT:    i16x8.extend_low_i8x16_u $push1=, $1
+; SIMD128-FAST-NEXT:    i16x8.mul $push2=, $pop0, $pop1
+; SIMD128-FAST-NEXT:    return $pop2
+;
+; NO-SIMD128-LABEL: extmul_low_u_v8i16:
+; NO-SIMD128:         .functype extmul_low_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-NEXT:    i32.and $push2=, $5, $pop0
+; NO-SIMD128-NEXT:    i32.const $push47=, 255
+; NO-SIMD128-NEXT:    i32.and $push1=, $21, $pop47
+; NO-SIMD128-NEXT:    i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push46=, 255
+; NO-SIMD128-NEXT:    i32.and $push5=, $3, $pop46
+; NO-SIMD128-NEXT:    i32.const $push45=, 255
+; NO-SIMD128-NEXT:    i32.and $push4=, $19, $pop45
+; NO-SIMD128-NEXT:    i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop6
+; NO-SIMD128-NEXT:    i32.const $push44=, 255
+; NO-SIMD128-NEXT:    i32.and $push8=, $2, $pop44
+; NO-SIMD128-NEXT:    i32.const $push43=, 255
+; NO-SIMD128-NEXT:    i32.and $push7=, $18, $pop43
+; NO-SIMD128-NEXT:    i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop9
+; NO-SIMD128-NEXT:    i32.const $push42=, 255
+; NO-SIMD128-NEXT:    i32.and $push11=, $1, $pop42
+; NO-SIMD128-NEXT:    i32.const $push41=, 255
+; NO-SIMD128-NEXT:    i32.and $push10=, $17, $pop41
+; NO-SIMD128-NEXT:    i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop12
+; NO-SIMD128-NEXT:    i32.const $push16=, 14
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.const $push40=, 255
+; NO-SIMD128-NEXT:    i32.and $push14=, $8, $pop40
+; NO-SIMD128-NEXT:    i32.const $push39=, 255
+; NO-SIMD128-NEXT:    i32.and $push13=, $24, $pop39
+; NO-SIMD128-NEXT:    i32.mul $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT:    i32.store16 0($pop17), $pop15
+; NO-SIMD128-NEXT:    i32.const $push21=, 12
+; NO-SIMD128-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT:    i32.const $push38=, 255
+; NO-SIMD128-NEXT:    i32.and $push19=, $7, $pop38
+; NO-SIMD128-NEXT:    i32.const $push37=, 255
+; NO-SIMD128-NEXT:    i32.and $push18=, $23, $pop37
+; NO-SIMD128-NEXT:    i32.mul $push20=, $pop19, $pop18
+; NO-SIMD128-NEXT:    i32.store16 0($pop22), $pop20
+; NO-SIMD128-NEXT:    i32.const $push26=, 10
+; NO-SIMD128-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT:    i32.const $push36=, 255
+; NO-SIMD128-NEXT:    i32.and $push24=, $6, $pop36
+; NO-SIMD128-NEXT:    i32.const $push35=, 255
+; NO-SIMD128-NEXT:    i32.and $push23=, $22, $pop35
+; NO-SIMD128-NEXT:    i32.mul $push25=, $pop24, $pop23
+; NO-SIMD128-NEXT:    i32.store16 0($pop27), $pop25
+; NO-SIMD128-NEXT:    i32.const $push31=, 6
+; NO-SIMD128-NEXT:    i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT:    i32.const $push34=, 255
+; NO-SIMD128-NEXT:    i32.and $push29=, $4, $pop34
+; NO-SIMD128-NEXT:    i32.const $push33=, 255
+; NO-SIMD128-NEXT:    i32.and $push28=, $20, $pop33
+; NO-SIMD128-NEXT:    i32.mul $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT:    i32.store16 0($pop32), $pop30
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: extmul_low_u_v8i16:
+; NO-SIMD128-FAST:         .functype extmul_low_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $17, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push46=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $2, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.const $push45=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $18, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push44=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push8=, $3, $pop44
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $19, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push42=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $4, $pop42
+; NO-SIMD128-FAST-NEXT:    i32.const $push41=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $20, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop14), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push40=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push16=, $5, $pop40
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $21, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.mul $push17=, $pop16, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push38=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $6, $pop38
+; NO-SIMD128-FAST-NEXT:    i32.const $push37=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push18=, $22, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.mul $push20=, $pop19, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop22), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push36=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push24=, $7, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $23, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.mul $push25=, $pop24, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop27), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push31=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push34=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $8, $pop34
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push28=, $24, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.mul $push30=, $pop29, $pop28
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT:    return
   %low1 = shufflevector <16 x i8> %v1, <16 x i8> undef,
            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %low2 = shufflevector <16 x i8> %v2, <16 x i8> undef,
@@ -773,12 +9814,142 @@ define <8 x i16> @extmul_low_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
   ret <8 x i16> %a
 }
 
-; CHECK-LABEL: extmul_high_u_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype extmul_high_u_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: i16x8.extmul_high_i8x16_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @extmul_high_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
+; SIMD128-LABEL: extmul_high_u_v8i16:
+; SIMD128:         .functype extmul_high_u_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extmul_high_i8x16_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: extmul_high_u_v8i16:
+; SIMD128-FAST:         .functype extmul_high_u_v8i16 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i16x8.extend_high_i8x16_u $push0=, $0
+; SIMD128-FAST-NEXT:    i16x8.extend_high_i8x16_u $push1=, $1
+; SIMD128-FAST-NEXT:    i16x8.mul $push2=, $pop0, $pop1
+; SIMD128-FAST-NEXT:    return $pop2
+;
+; NO-SIMD128-LABEL: extmul_high_u_v8i16:
+; NO-SIMD128:         .functype extmul_high_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-NEXT:    i32.and $push2=, $13, $pop0
+; NO-SIMD128-NEXT:    i32.const $push47=, 255
+; NO-SIMD128-NEXT:    i32.and $push1=, $29, $pop47
+; NO-SIMD128-NEXT:    i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push46=, 255
+; NO-SIMD128-NEXT:    i32.and $push5=, $11, $pop46
+; NO-SIMD128-NEXT:    i32.const $push45=, 255
+; NO-SIMD128-NEXT:    i32.and $push4=, $27, $pop45
+; NO-SIMD128-NEXT:    i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop6
+; NO-SIMD128-NEXT:    i32.const $push44=, 255
+; NO-SIMD128-NEXT:    i32.and $push8=, $10, $pop44
+; NO-SIMD128-NEXT:    i32.const $push43=, 255
+; NO-SIMD128-NEXT:    i32.and $push7=, $26, $pop43
+; NO-SIMD128-NEXT:    i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop9
+; NO-SIMD128-NEXT:    i32.const $push42=, 255
+; NO-SIMD128-NEXT:    i32.and $push11=, $9, $pop42
+; NO-SIMD128-NEXT:    i32.const $push41=, 255
+; NO-SIMD128-NEXT:    i32.and $push10=, $25, $pop41
+; NO-SIMD128-NEXT:    i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop12
+; NO-SIMD128-NEXT:    i32.const $push16=, 14
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.const $push40=, 255
+; NO-SIMD128-NEXT:    i32.and $push14=, $16, $pop40
+; NO-SIMD128-NEXT:    i32.const $push39=, 255
+; NO-SIMD128-NEXT:    i32.and $push13=, $32, $pop39
+; NO-SIMD128-NEXT:    i32.mul $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT:    i32.store16 0($pop17), $pop15
+; NO-SIMD128-NEXT:    i32.const $push21=, 12
+; NO-SIMD128-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT:    i32.const $push38=, 255
+; NO-SIMD128-NEXT:    i32.and $push19=, $15, $pop38
+; NO-SIMD128-NEXT:    i32.const $push37=, 255
+; NO-SIMD128-NEXT:    i32.and $push18=, $31, $pop37
+; NO-SIMD128-NEXT:    i32.mul $push20=, $pop19, $pop18
+; NO-SIMD128-NEXT:    i32.store16 0($pop22), $pop20
+; NO-SIMD128-NEXT:    i32.const $push26=, 10
+; NO-SIMD128-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT:    i32.const $push36=, 255
+; NO-SIMD128-NEXT:    i32.and $push24=, $14, $pop36
+; NO-SIMD128-NEXT:    i32.const $push35=, 255
+; NO-SIMD128-NEXT:    i32.and $push23=, $30, $pop35
+; NO-SIMD128-NEXT:    i32.mul $push25=, $pop24, $pop23
+; NO-SIMD128-NEXT:    i32.store16 0($pop27), $pop25
+; NO-SIMD128-NEXT:    i32.const $push31=, 6
+; NO-SIMD128-NEXT:    i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT:    i32.const $push34=, 255
+; NO-SIMD128-NEXT:    i32.and $push29=, $12, $pop34
+; NO-SIMD128-NEXT:    i32.const $push33=, 255
+; NO-SIMD128-NEXT:    i32.and $push28=, $28, $pop33
+; NO-SIMD128-NEXT:    i32.mul $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT:    i32.store16 0($pop32), $pop30
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: extmul_high_u_v8i16:
+; NO-SIMD128-FAST:         .functype extmul_high_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $9, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.const $push47=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $25, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push46=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $10, $pop46
+; NO-SIMD128-FAST-NEXT:    i32.const $push45=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $26, $pop45
+; NO-SIMD128-FAST-NEXT:    i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push44=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push8=, $11, $pop44
+; NO-SIMD128-FAST-NEXT:    i32.const $push43=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $27, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 6
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push42=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $12, $pop42
+; NO-SIMD128-FAST-NEXT:    i32.const $push41=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $28, $pop41
+; NO-SIMD128-FAST-NEXT:    i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop14), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push40=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push16=, $13, $pop40
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $29, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.mul $push17=, $pop16, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 10
+; NO-SIMD128-FAST-NEXT:    i32.add $push22=, $0, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.const $push38=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $14, $pop38
+; NO-SIMD128-FAST-NEXT:    i32.const $push37=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push18=, $30, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.mul $push20=, $pop19, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop22), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push26=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.const $push36=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push24=, $15, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $31, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.mul $push25=, $pop24, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop27), $pop25
+; NO-SIMD128-FAST-NEXT:    i32.const $push31=, 14
+; NO-SIMD128-FAST-NEXT:    i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.const $push34=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $16, $pop34
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, 255
+; NO-SIMD128-FAST-NEXT:    i32.and $push28=, $32, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.mul $push30=, $pop29, $pop28
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT:    return
   %high1 = shufflevector <16 x i8> %v1, <16 x i8> undef,
            <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %high2 = shufflevector <16 x i8> %v2, <16 x i8> undef,
@@ -792,108 +9963,540 @@ define <8 x i16> @extmul_high_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
 ; ==============================================================================
 ; 4 x i32
 ; ==============================================================================
-; CHECK-LABEL: add_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype add_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.add $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @add_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SIMD128-LABEL: add_v4i32:
+; SIMD128:         .functype add_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.add $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: add_v4i32:
+; SIMD128-FAST:         .functype add_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.add $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: add_v4i32:
+; NO-SIMD128:         .functype add_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.add $push0=, $3, $7
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.add $push1=, $2, $6
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.add $push2=, $1, $5
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.add $push3=, $4, $8
+; NO-SIMD128-NEXT:    i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: add_v4i32:
+; NO-SIMD128-FAST:         .functype add_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.add $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.add $push1=, $2, $6
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.add $push2=, $3, $7
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $4, $8
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = add <4 x i32> %x, %y
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: sub_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype sub_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.sub $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @sub_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SIMD128-LABEL: sub_v4i32:
+; SIMD128:         .functype sub_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.sub $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: sub_v4i32:
+; SIMD128-FAST:         .functype sub_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.sub $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: sub_v4i32:
+; NO-SIMD128:         .functype sub_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.sub $push0=, $3, $7
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.sub $push1=, $2, $6
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.sub $push2=, $1, $5
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.sub $push3=, $4, $8
+; NO-SIMD128-NEXT:    i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: sub_v4i32:
+; NO-SIMD128-FAST:         .functype sub_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.sub $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push1=, $2, $6
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.sub $push2=, $3, $7
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.sub $push5=, $4, $8
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = sub <4 x i32> %x, %y
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: mul_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype mul_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.mul $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @mul_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SIMD128-LABEL: mul_v4i32:
+; SIMD128:         .functype mul_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.mul $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: mul_v4i32:
+; SIMD128-FAST:         .functype mul_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.mul $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: mul_v4i32:
+; NO-SIMD128:         .functype mul_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.mul $push0=, $3, $7
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.mul $push1=, $2, $6
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.mul $push2=, $1, $5
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.mul $push3=, $4, $8
+; NO-SIMD128-NEXT:    i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: mul_v4i32:
+; NO-SIMD128-FAST:         .functype mul_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.mul $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.mul $push1=, $2, $6
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.mul $push2=, $3, $7
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.mul $push5=, $4, $8
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = mul <4 x i32> %x, %y
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: min_s_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype min_s_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.min_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @min_s_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SIMD128-LABEL: min_s_v4i32:
+; SIMD128:         .functype min_s_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.min_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: min_s_v4i32:
+; SIMD128-FAST:         .functype min_s_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.min_s $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: min_s_v4i32:
+; NO-SIMD128:         .functype min_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.lt_s $push0=, $3, $7
+; NO-SIMD128-NEXT:    i32.select $push1=, $3, $7, $pop0
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop1
+; NO-SIMD128-NEXT:    i32.lt_s $push2=, $2, $6
+; NO-SIMD128-NEXT:    i32.select $push3=, $2, $6, $pop2
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop3
+; NO-SIMD128-NEXT:    i32.lt_s $push4=, $1, $5
+; NO-SIMD128-NEXT:    i32.select $push5=, $1, $5, $pop4
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop5
+; NO-SIMD128-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.lt_s $push6=, $4, $8
+; NO-SIMD128-NEXT:    i32.select $push7=, $4, $8, $pop6
+; NO-SIMD128-NEXT:    i32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: min_s_v4i32:
+; NO-SIMD128-FAST:         .functype min_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.select $push1=, $1, $5, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push2=, $2, $6
+; NO-SIMD128-FAST-NEXT:    i32.select $push3=, $2, $6, $pop2
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push4=, $3, $7
+; NO-SIMD128-FAST-NEXT:    i32.select $push5=, $3, $7, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.lt_s $push6=, $4, $8
+; NO-SIMD128-FAST-NEXT:    i32.select $push7=, $4, $8, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT:    return
   %c = icmp slt <4 x i32> %x, %y
   %a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: min_u_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype min_u_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.min_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @min_u_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SIMD128-LABEL: min_u_v4i32:
+; SIMD128:         .functype min_u_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.min_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: min_u_v4i32:
+; SIMD128-FAST:         .functype min_u_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.min_u $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: min_u_v4i32:
+; NO-SIMD128:         .functype min_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.lt_u $push0=, $3, $7
+; NO-SIMD128-NEXT:    i32.select $push1=, $3, $7, $pop0
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop1
+; NO-SIMD128-NEXT:    i32.lt_u $push2=, $2, $6
+; NO-SIMD128-NEXT:    i32.select $push3=, $2, $6, $pop2
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop3
+; NO-SIMD128-NEXT:    i32.lt_u $push4=, $1, $5
+; NO-SIMD128-NEXT:    i32.select $push5=, $1, $5, $pop4
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop5
+; NO-SIMD128-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.lt_u $push6=, $4, $8
+; NO-SIMD128-NEXT:    i32.select $push7=, $4, $8, $pop6
+; NO-SIMD128-NEXT:    i32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: min_u_v4i32:
+; NO-SIMD128-FAST:         .functype min_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.select $push1=, $1, $5, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push2=, $2, $6
+; NO-SIMD128-FAST-NEXT:    i32.select $push3=, $2, $6, $pop2
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push4=, $3, $7
+; NO-SIMD128-FAST-NEXT:    i32.select $push5=, $3, $7, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.lt_u $push6=, $4, $8
+; NO-SIMD128-FAST-NEXT:    i32.select $push7=, $4, $8, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT:    return
   %c = icmp ult <4 x i32> %x, %y
   %a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: max_s_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype max_s_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.max_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @max_s_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SIMD128-LABEL: max_s_v4i32:
+; SIMD128:         .functype max_s_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.max_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: max_s_v4i32:
+; SIMD128-FAST:         .functype max_s_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.max_s $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: max_s_v4i32:
+; NO-SIMD128:         .functype max_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.gt_s $push0=, $3, $7
+; NO-SIMD128-NEXT:    i32.select $push1=, $3, $7, $pop0
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop1
+; NO-SIMD128-NEXT:    i32.gt_s $push2=, $2, $6
+; NO-SIMD128-NEXT:    i32.select $push3=, $2, $6, $pop2
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop3
+; NO-SIMD128-NEXT:    i32.gt_s $push4=, $1, $5
+; NO-SIMD128-NEXT:    i32.select $push5=, $1, $5, $pop4
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop5
+; NO-SIMD128-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.gt_s $push6=, $4, $8
+; NO-SIMD128-NEXT:    i32.select $push7=, $4, $8, $pop6
+; NO-SIMD128-NEXT:    i32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: max_s_v4i32:
+; NO-SIMD128-FAST:         .functype max_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.select $push1=, $1, $5, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push2=, $2, $6
+; NO-SIMD128-FAST-NEXT:    i32.select $push3=, $2, $6, $pop2
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push4=, $3, $7
+; NO-SIMD128-FAST-NEXT:    i32.select $push5=, $3, $7, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.gt_s $push6=, $4, $8
+; NO-SIMD128-FAST-NEXT:    i32.select $push7=, $4, $8, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT:    return
   %c = icmp sgt <4 x i32> %x, %y
   %a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: max_u_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype max_u_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.max_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @max_u_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SIMD128-LABEL: max_u_v4i32:
+; SIMD128:         .functype max_u_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.max_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: max_u_v4i32:
+; SIMD128-FAST:         .functype max_u_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.max_u $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: max_u_v4i32:
+; NO-SIMD128:         .functype max_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.gt_u $push0=, $3, $7
+; NO-SIMD128-NEXT:    i32.select $push1=, $3, $7, $pop0
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop1
+; NO-SIMD128-NEXT:    i32.gt_u $push2=, $2, $6
+; NO-SIMD128-NEXT:    i32.select $push3=, $2, $6, $pop2
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop3
+; NO-SIMD128-NEXT:    i32.gt_u $push4=, $1, $5
+; NO-SIMD128-NEXT:    i32.select $push5=, $1, $5, $pop4
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop5
+; NO-SIMD128-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.gt_u $push6=, $4, $8
+; NO-SIMD128-NEXT:    i32.select $push7=, $4, $8, $pop6
+; NO-SIMD128-NEXT:    i32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: max_u_v4i32:
+; NO-SIMD128-FAST:         .functype max_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.select $push1=, $1, $5, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push2=, $2, $6
+; NO-SIMD128-FAST-NEXT:    i32.select $push3=, $2, $6, $pop2
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push4=, $3, $7
+; NO-SIMD128-FAST-NEXT:    i32.select $push5=, $3, $7, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.gt_u $push6=, $4, $8
+; NO-SIMD128-FAST-NEXT:    i32.select $push7=, $4, $8, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT:    return
   %c = icmp ugt <4 x i32> %x, %y
   %a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: abs_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype abs_v4i32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.abs $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @abs_v4i32(<4 x i32> %x) {
+; SIMD128-LABEL: abs_v4i32:
+; SIMD128:         .functype abs_v4i32 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.abs $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: abs_v4i32:
+; SIMD128-FAST:         .functype abs_v4i32 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.abs $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: abs_v4i32:
+; NO-SIMD128:         .functype abs_v4i32 (i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-NEXT:    i32.const $push0=, 31
+; NO-SIMD128-NEXT:    i32.shr_s $push21=, $4, $pop0
+; NO-SIMD128-NEXT:    local.tee $push20=, $5=, $pop21
+; NO-SIMD128-NEXT:    i32.xor $push1=, $4, $pop20
+; NO-SIMD128-NEXT:    i32.sub $push2=, $pop1, $5
+; NO-SIMD128-NEXT:    i32.store 0($pop4), $pop2
+; NO-SIMD128-NEXT:    i32.const $push19=, 31
+; NO-SIMD128-NEXT:    i32.shr_s $push18=, $3, $pop19
+; NO-SIMD128-NEXT:    local.tee $push17=, $4=, $pop18
+; NO-SIMD128-NEXT:    i32.xor $push5=, $3, $pop17
+; NO-SIMD128-NEXT:    i32.sub $push6=, $pop5, $4
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop6
+; NO-SIMD128-NEXT:    i32.const $push16=, 31
+; NO-SIMD128-NEXT:    i32.shr_s $push15=, $2, $pop16
+; NO-SIMD128-NEXT:    local.tee $push14=, $4=, $pop15
+; NO-SIMD128-NEXT:    i32.xor $push7=, $2, $pop14
+; NO-SIMD128-NEXT:    i32.sub $push8=, $pop7, $4
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop8
+; NO-SIMD128-NEXT:    i32.const $push13=, 31
+; NO-SIMD128-NEXT:    i32.shr_s $push12=, $1, $pop13
+; NO-SIMD128-NEXT:    local.tee $push11=, $4=, $pop12
+; NO-SIMD128-NEXT:    i32.xor $push9=, $1, $pop11
+; NO-SIMD128-NEXT:    i32.sub $push10=, $pop9, $4
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop10
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: abs_v4i32:
+; NO-SIMD128-FAST:         .functype abs_v4i32 (i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 31
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push21=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    local.tee $push20=, $5=, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.xor $push1=, $1, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.sub $push2=, $pop1, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push19=, 31
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push18=, $2, $pop19
+; NO-SIMD128-FAST-NEXT:    local.tee $push17=, $1=, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $2, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.sub $push4=, $pop3, $1
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push16=, 31
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push15=, $3, $pop16
+; NO-SIMD128-FAST-NEXT:    local.tee $push14=, $2=, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $3, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.sub $push6=, $pop5, $2
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push9=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 31
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push12=, $4, $pop13
+; NO-SIMD128-FAST-NEXT:    local.tee $push11=, $0=, $pop12
+; NO-SIMD128-FAST-NEXT:    i32.xor $push7=, $4, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.sub $push8=, $pop7, $0
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT:    return
   %a = sub <4 x i32> zeroinitializer, %x
   %b = icmp slt <4 x i32> %x, zeroinitializer
   %c = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %x
   ret <4 x i32> %c
 }
 
-; CHECK-LABEL: neg_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype neg_v4i32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.neg $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @neg_v4i32(<4 x i32> %x) {
+; SIMD128-LABEL: neg_v4i32:
+; SIMD128:         .functype neg_v4i32 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.neg $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: neg_v4i32:
+; SIMD128-FAST:         .functype neg_v4i32 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.neg $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: neg_v4i32:
+; NO-SIMD128:         .functype neg_v4i32 (i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 0
+; NO-SIMD128-NEXT:    i32.sub $push1=, $pop0, $3
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop1
+; NO-SIMD128-NEXT:    i32.const $push9=, 0
+; NO-SIMD128-NEXT:    i32.sub $push2=, $pop9, $2
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push8=, 0
+; NO-SIMD128-NEXT:    i32.sub $push3=, $pop8, $1
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 12
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.const $push7=, 0
+; NO-SIMD128-NEXT:    i32.sub $push4=, $pop7, $4
+; NO-SIMD128-NEXT:    i32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: neg_v4i32:
+; NO-SIMD128-FAST:         .functype neg_v4i32 (i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.const $push9=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push2=, $pop9, $2
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push8=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push3=, $pop8, $3
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 0
+; NO-SIMD128-FAST-NEXT:    i32.sub $push6=, $pop7, $4
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT:    return
   %a = sub <4 x i32> <i32 0, i32 0, i32 0, i32 0>, %x
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: shl_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype shl_v4i32 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.shl $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @shl_v4i32(<4 x i32> %v, i32 %x) {
+; SIMD128-LABEL: shl_v4i32:
+; SIMD128:         .functype shl_v4i32 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.shl $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: shl_v4i32:
+; SIMD128-FAST:         .functype shl_v4i32 (v128, i32) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.shl $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shl_v4i32:
+; NO-SIMD128:         .functype shl_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.shl $push0=, $3, $5
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.shl $push1=, $2, $5
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.shl $push2=, $1, $5
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.shl $push3=, $4, $5
+; NO-SIMD128-NEXT:    i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shl_v4i32:
+; NO-SIMD128-FAST:         .functype shl_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.shl $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.shl $push1=, $2, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.shl $push2=, $3, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.shl $push5=, $4, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %t = insertelement <4 x i32> undef, i32 %x, i32 0
   %s = shufflevector <4 x i32> %t, <4 x i32> undef,
     <4 x i32> <i32 0, i32 0, i32 0, i32 0>
@@ -901,41 +10504,180 @@ define <4 x i32> @shl_v4i32(<4 x i32> %v, i32 %x) {
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: shl_const_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype shl_const_v4i32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5
-; SIMD128-NEXT: i32x4.shl $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @shl_const_v4i32(<4 x i32> %v) {
+; SIMD128-LABEL: shl_const_v4i32:
+; SIMD128:         .functype shl_const_v4i32 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32.const $push0=, 5
+; SIMD128-NEXT:    i32x4.shl $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: shl_const_v4i32:
+; SIMD128-FAST:         .functype shl_const_v4i32 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32.const $push1=, 5
+; SIMD128-FAST-NEXT:    i32x4.shl $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shl_const_v4i32:
+; NO-SIMD128:         .functype shl_const_v4i32 (i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 5
+; NO-SIMD128-NEXT:    i32.shl $push1=, $3, $pop0
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop1
+; NO-SIMD128-NEXT:    i32.const $push9=, 5
+; NO-SIMD128-NEXT:    i32.shl $push2=, $2, $pop9
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push8=, 5
+; NO-SIMD128-NEXT:    i32.shl $push3=, $1, $pop8
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 12
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.const $push7=, 5
+; NO-SIMD128-NEXT:    i32.shl $push4=, $4, $pop7
+; NO-SIMD128-NEXT:    i32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shl_const_v4i32:
+; NO-SIMD128-FAST:         .functype shl_const_v4i32 (i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.const $push9=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push2=, $2, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push8=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push3=, $3, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 5
+; NO-SIMD128-FAST-NEXT:    i32.shl $push6=, $4, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT:    return
   %a = shl <4 x i32> %v, <i32 5, i32 5, i32 5, i32 5>
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: shl_vec_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype shl_vec_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: i32x4.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
-; SIMD128-NEXT: i32.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
-; Skip 2 lanes
-; SIMD128:      i32x4.extract_lane $push[[L4:[0-9]+]]=, $0, 3{{$}}
-; SIMD128-NEXT: i32x4.extract_lane $push[[L5:[0-9]+]]=, $1, 3{{$}}
-; SIMD128-NEXT: i32.shl $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
-; SIMD128-NEXT: i32x4.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 3, $pop[[L6]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @shl_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
+; SIMD128-LABEL: shl_vec_v4i32:
+; SIMD128:         .functype shl_vec_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.extract_lane $push4=, $0, 0
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $1, 0
+; SIMD128-NEXT:    i32.shl $push5=, $pop4, $pop3
+; SIMD128-NEXT:    i32x4.splat $push6=, $pop5
+; SIMD128-NEXT:    i32x4.extract_lane $push1=, $0, 1
+; SIMD128-NEXT:    i32x4.extract_lane $push0=, $1, 1
+; SIMD128-NEXT:    i32.shl $push2=, $pop1, $pop0
+; SIMD128-NEXT:    i32x4.replace_lane $push7=, $pop6, 1, $pop2
+; SIMD128-NEXT:    i32x4.extract_lane $push9=, $0, 2
+; SIMD128-NEXT:    i32x4.extract_lane $push8=, $1, 2
+; SIMD128-NEXT:    i32.shl $push10=, $pop9, $pop8
+; SIMD128-NEXT:    i32x4.replace_lane $push11=, $pop7, 2, $pop10
+; SIMD128-NEXT:    i32x4.extract_lane $push13=, $0, 3
+; SIMD128-NEXT:    i32x4.extract_lane $push12=, $1, 3
+; SIMD128-NEXT:    i32.shl $push14=, $pop13, $pop12
+; SIMD128-NEXT:    i32x4.replace_lane $push15=, $pop11, 3, $pop14
+; SIMD128-NEXT:    return $pop15
+;
+; SIMD128-FAST-LABEL: shl_vec_v4i32:
+; SIMD128-FAST:         .functype shl_vec_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push5=, $0, 0
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push4=, $1, 0
+; SIMD128-FAST-NEXT:    i32.shl $push6=, $pop5, $pop4
+; SIMD128-FAST-NEXT:    i32x4.splat $push7=, $pop6
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push2=, $0, 1
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push1=, $1, 1
+; SIMD128-FAST-NEXT:    i32.shl $push3=, $pop2, $pop1
+; SIMD128-FAST-NEXT:    i32x4.replace_lane $push8=, $pop7, 1, $pop3
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push10=, $0, 2
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push9=, $1, 2
+; SIMD128-FAST-NEXT:    i32.shl $push11=, $pop10, $pop9
+; SIMD128-FAST-NEXT:    i32x4.replace_lane $push12=, $pop8, 2, $pop11
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push14=, $0, 3
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push13=, $1, 3
+; SIMD128-FAST-NEXT:    i32.shl $push15=, $pop14, $pop13
+; SIMD128-FAST-NEXT:    i32x4.replace_lane $push0=, $pop12, 3, $pop15
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shl_vec_v4i32:
+; NO-SIMD128:         .functype shl_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.shl $push0=, $3, $7
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.shl $push1=, $2, $6
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.shl $push2=, $1, $5
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.shl $push3=, $4, $8
+; NO-SIMD128-NEXT:    i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shl_vec_v4i32:
+; NO-SIMD128-FAST:         .functype shl_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.shl $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.shl $push1=, $2, $6
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.shl $push2=, $3, $7
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.shl $push5=, $4, $8
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = shl <4 x i32> %v, %x
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: shr_s_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype shr_s_v4i32 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.shr_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @shr_s_v4i32(<4 x i32> %v, i32 %x) {
+; SIMD128-LABEL: shr_s_v4i32:
+; SIMD128:         .functype shr_s_v4i32 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.shr_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: shr_s_v4i32:
+; SIMD128-FAST:         .functype shr_s_v4i32 (v128, i32) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.shr_s $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_s_v4i32:
+; NO-SIMD128:         .functype shr_s_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.shr_s $push0=, $3, $5
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.shr_s $push1=, $2, $5
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.shr_s $push2=, $1, $5
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.shr_s $push3=, $4, $5
+; NO-SIMD128-NEXT:    i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_s_v4i32:
+; NO-SIMD128-FAST:         .functype shr_s_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push1=, $2, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push2=, $3, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push5=, $4, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %t = insertelement <4 x i32> undef, i32 %x, i32 0
   %s = shufflevector <4 x i32> %t, <4 x i32> undef,
     <4 x i32> <i32 0, i32 0, i32 0, i32 0>
@@ -943,30 +10685,124 @@ define <4 x i32> @shr_s_v4i32(<4 x i32> %v, i32 %x) {
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: shr_s_vec_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype shr_s_vec_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: i32x4.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
-; SIMD128-NEXT: i32.shr_s $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
-; Skip 2 lanes
-; SIMD128:      i32x4.extract_lane $push[[L4:[0-9]+]]=, $0, 3{{$}}
-; SIMD128-NEXT: i32x4.extract_lane $push[[L5:[0-9]+]]=, $1, 3{{$}}
-; SIMD128-NEXT: i32.shr_s $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
-; SIMD128-NEXT: i32x4.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 3, $pop[[L6]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @shr_s_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
+; SIMD128-LABEL: shr_s_vec_v4i32:
+; SIMD128:         .functype shr_s_vec_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.extract_lane $push4=, $0, 0
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $1, 0
+; SIMD128-NEXT:    i32.shr_s $push5=, $pop4, $pop3
+; SIMD128-NEXT:    i32x4.splat $push6=, $pop5
+; SIMD128-NEXT:    i32x4.extract_lane $push1=, $0, 1
+; SIMD128-NEXT:    i32x4.extract_lane $push0=, $1, 1
+; SIMD128-NEXT:    i32.shr_s $push2=, $pop1, $pop0
+; SIMD128-NEXT:    i32x4.replace_lane $push7=, $pop6, 1, $pop2
+; SIMD128-NEXT:    i32x4.extract_lane $push9=, $0, 2
+; SIMD128-NEXT:    i32x4.extract_lane $push8=, $1, 2
+; SIMD128-NEXT:    i32.shr_s $push10=, $pop9, $pop8
+; SIMD128-NEXT:    i32x4.replace_lane $push11=, $pop7, 2, $pop10
+; SIMD128-NEXT:    i32x4.extract_lane $push13=, $0, 3
+; SIMD128-NEXT:    i32x4.extract_lane $push12=, $1, 3
+; SIMD128-NEXT:    i32.shr_s $push14=, $pop13, $pop12
+; SIMD128-NEXT:    i32x4.replace_lane $push15=, $pop11, 3, $pop14
+; SIMD128-NEXT:    return $pop15
+;
+; SIMD128-FAST-LABEL: shr_s_vec_v4i32:
+; SIMD128-FAST:         .functype shr_s_vec_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push5=, $0, 0
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push4=, $1, 0
+; SIMD128-FAST-NEXT:    i32.shr_s $push6=, $pop5, $pop4
+; SIMD128-FAST-NEXT:    i32x4.splat $push7=, $pop6
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push2=, $0, 1
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push1=, $1, 1
+; SIMD128-FAST-NEXT:    i32.shr_s $push3=, $pop2, $pop1
+; SIMD128-FAST-NEXT:    i32x4.replace_lane $push8=, $pop7, 1, $pop3
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push10=, $0, 2
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push9=, $1, 2
+; SIMD128-FAST-NEXT:    i32.shr_s $push11=, $pop10, $pop9
+; SIMD128-FAST-NEXT:    i32x4.replace_lane $push12=, $pop8, 2, $pop11
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push14=, $0, 3
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push13=, $1, 3
+; SIMD128-FAST-NEXT:    i32.shr_s $push15=, $pop14, $pop13
+; SIMD128-FAST-NEXT:    i32x4.replace_lane $push0=, $pop12, 3, $pop15
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_s_vec_v4i32:
+; NO-SIMD128:         .functype shr_s_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.shr_s $push0=, $3, $7
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.shr_s $push1=, $2, $6
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.shr_s $push2=, $1, $5
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.shr_s $push3=, $4, $8
+; NO-SIMD128-NEXT:    i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_s_vec_v4i32:
+; NO-SIMD128-FAST:         .functype shr_s_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push1=, $2, $6
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push2=, $3, $7
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.shr_s $push5=, $4, $8
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = ashr <4 x i32> %v, %x
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: shr_u_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype shr_u_v4i32 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.shr_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @shr_u_v4i32(<4 x i32> %v, i32 %x) {
+; SIMD128-LABEL: shr_u_v4i32:
+; SIMD128:         .functype shr_u_v4i32 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.shr_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: shr_u_v4i32:
+; SIMD128-FAST:         .functype shr_u_v4i32 (v128, i32) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.shr_u $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_u_v4i32:
+; NO-SIMD128:         .functype shr_u_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.shr_u $push0=, $3, $5
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.shr_u $push1=, $2, $5
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.shr_u $push2=, $1, $5
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.shr_u $push3=, $4, $5
+; NO-SIMD128-NEXT:    i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_u_v4i32:
+; NO-SIMD128-FAST:         .functype shr_u_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push1=, $2, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push2=, $3, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push5=, $4, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %t = insertelement <4 x i32> undef, i32 %x, i32 0
   %s = shufflevector <4 x i32> %t, <4 x i32> undef,
     <4 x i32> <i32 0, i32 0, i32 0, i32 0>
@@ -974,89 +10810,415 @@ define <4 x i32> @shr_u_v4i32(<4 x i32> %v, i32 %x) {
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: shr_u_vec_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype shr_u_vec_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: i32x4.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
-; SIMD128-NEXT: i32.shr_u $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
-; Skip 2 lanes
-; SIMD128:      i32x4.extract_lane $push[[L4:[0-9]+]]=, $0, 3{{$}}
-; SIMD128-NEXT: i32x4.extract_lane $push[[L5:[0-9]+]]=, $1, 3{{$}}
-; SIMD128-NEXT: i32.shr_u $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
-; SIMD128-NEXT: i32x4.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 3, $pop[[L6]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @shr_u_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
+; SIMD128-LABEL: shr_u_vec_v4i32:
+; SIMD128:         .functype shr_u_vec_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.extract_lane $push4=, $0, 0
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $1, 0
+; SIMD128-NEXT:    i32.shr_u $push5=, $pop4, $pop3
+; SIMD128-NEXT:    i32x4.splat $push6=, $pop5
+; SIMD128-NEXT:    i32x4.extract_lane $push1=, $0, 1
+; SIMD128-NEXT:    i32x4.extract_lane $push0=, $1, 1
+; SIMD128-NEXT:    i32.shr_u $push2=, $pop1, $pop0
+; SIMD128-NEXT:    i32x4.replace_lane $push7=, $pop6, 1, $pop2
+; SIMD128-NEXT:    i32x4.extract_lane $push9=, $0, 2
+; SIMD128-NEXT:    i32x4.extract_lane $push8=, $1, 2
+; SIMD128-NEXT:    i32.shr_u $push10=, $pop9, $pop8
+; SIMD128-NEXT:    i32x4.replace_lane $push11=, $pop7, 2, $pop10
+; SIMD128-NEXT:    i32x4.extract_lane $push13=, $0, 3
+; SIMD128-NEXT:    i32x4.extract_lane $push12=, $1, 3
+; SIMD128-NEXT:    i32.shr_u $push14=, $pop13, $pop12
+; SIMD128-NEXT:    i32x4.replace_lane $push15=, $pop11, 3, $pop14
+; SIMD128-NEXT:    return $pop15
+;
+; SIMD128-FAST-LABEL: shr_u_vec_v4i32:
+; SIMD128-FAST:         .functype shr_u_vec_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push5=, $0, 0
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push4=, $1, 0
+; SIMD128-FAST-NEXT:    i32.shr_u $push6=, $pop5, $pop4
+; SIMD128-FAST-NEXT:    i32x4.splat $push7=, $pop6
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push2=, $0, 1
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push1=, $1, 1
+; SIMD128-FAST-NEXT:    i32.shr_u $push3=, $pop2, $pop1
+; SIMD128-FAST-NEXT:    i32x4.replace_lane $push8=, $pop7, 1, $pop3
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push10=, $0, 2
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push9=, $1, 2
+; SIMD128-FAST-NEXT:    i32.shr_u $push11=, $pop10, $pop9
+; SIMD128-FAST-NEXT:    i32x4.replace_lane $push12=, $pop8, 2, $pop11
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push14=, $0, 3
+; SIMD128-FAST-NEXT:    i32x4.extract_lane $push13=, $1, 3
+; SIMD128-FAST-NEXT:    i32.shr_u $push15=, $pop14, $pop13
+; SIMD128-FAST-NEXT:    i32x4.replace_lane $push0=, $pop12, 3, $pop15
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_u_vec_v4i32:
+; NO-SIMD128:         .functype shr_u_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.shr_u $push0=, $3, $7
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.shr_u $push1=, $2, $6
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.shr_u $push2=, $1, $5
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.shr_u $push3=, $4, $8
+; NO-SIMD128-NEXT:    i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_u_vec_v4i32:
+; NO-SIMD128-FAST:         .functype shr_u_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push1=, $2, $6
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push2=, $3, $7
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.shr_u $push5=, $4, $8
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = lshr <4 x i32> %v, %x
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: and_v4i32:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype and_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @and_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SIMD128-LABEL: and_v4i32:
+; SIMD128:         .functype and_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.and $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: and_v4i32:
+; SIMD128-FAST:         .functype and_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.and $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: and_v4i32:
+; NO-SIMD128:         .functype and_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.and $push0=, $3, $7
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.and $push1=, $2, $6
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.and $push2=, $1, $5
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.and $push3=, $4, $8
+; NO-SIMD128-NEXT:    i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: and_v4i32:
+; NO-SIMD128-FAST:         .functype and_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $2, $6
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $3, $7
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $4, $8
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = and <4 x i32> %x, %y
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: or_v4i32:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype or_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @or_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SIMD128-LABEL: or_v4i32:
+; SIMD128:         .functype or_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.or $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: or_v4i32:
+; SIMD128-FAST:         .functype or_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.or $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: or_v4i32:
+; NO-SIMD128:         .functype or_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.or $push0=, $3, $7
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.or $push1=, $2, $6
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.or $push2=, $1, $5
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.or $push3=, $4, $8
+; NO-SIMD128-NEXT:    i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: or_v4i32:
+; NO-SIMD128-FAST:         .functype or_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.or $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.or $push1=, $2, $6
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.or $push2=, $3, $7
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.or $push5=, $4, $8
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = or <4 x i32> %x, %y
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: xor_v4i32:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype xor_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.xor $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @xor_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SIMD128-LABEL: xor_v4i32:
+; SIMD128:         .functype xor_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.xor $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: xor_v4i32:
+; SIMD128-FAST:         .functype xor_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.xor $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: xor_v4i32:
+; NO-SIMD128:         .functype xor_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.xor $push0=, $3, $7
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i32.xor $push1=, $2, $6
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    i32.xor $push2=, $1, $5
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.xor $push3=, $4, $8
+; NO-SIMD128-NEXT:    i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: xor_v4i32:
+; NO-SIMD128-FAST:         .functype xor_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i32.xor $push1=, $2, $6
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $3, $7
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $4, $8
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = xor <4 x i32> %x, %y
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: not_v4i32:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype not_v4i32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @not_v4i32(<4 x i32> %x) {
+; SIMD128-LABEL: not_v4i32:
+; SIMD128:         .functype not_v4i32 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.not $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: not_v4i32:
+; SIMD128-FAST:         .functype not_v4i32 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.not $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: not_v4i32:
+; NO-SIMD128:         .functype not_v4i32 (i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-NEXT:    i32.xor $push1=, $3, $pop0
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop1
+; NO-SIMD128-NEXT:    i32.const $push9=, -1
+; NO-SIMD128-NEXT:    i32.xor $push2=, $2, $pop9
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push8=, -1
+; NO-SIMD128-NEXT:    i32.xor $push3=, $1, $pop8
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 12
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.const $push7=, -1
+; NO-SIMD128-NEXT:    i32.xor $push4=, $4, $pop7
+; NO-SIMD128-NEXT:    i32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: not_v4i32:
+; NO-SIMD128-FAST:         .functype not_v4i32 (i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i32.const $push9=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $2, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push8=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $3, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $4, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT:    return
   %a = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: andnot_v4i32:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype andnot_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: v128.andnot $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
-; SIMD128-FAST-NEXT: v128.not
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: return
 define <4 x i32> @andnot_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SIMD128-LABEL: andnot_v4i32:
+; SIMD128:         .functype andnot_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.andnot $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: andnot_v4i32:
+; SIMD128-FAST:         .functype andnot_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.not $push0=, $1
+; SIMD128-FAST-NEXT:    v128.and $push1=, $0, $pop0
+; SIMD128-FAST-NEXT:    return $pop1
+;
+; NO-SIMD128-LABEL: andnot_v4i32:
+; NO-SIMD128:         .functype andnot_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-NEXT:    i32.xor $push1=, $7, $pop0
+; NO-SIMD128-NEXT:    i32.and $push2=, $3, $pop1
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push13=, -1
+; NO-SIMD128-NEXT:    i32.xor $push3=, $6, $pop13
+; NO-SIMD128-NEXT:    i32.and $push4=, $2, $pop3
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push12=, -1
+; NO-SIMD128-NEXT:    i32.xor $push5=, $5, $pop12
+; NO-SIMD128-NEXT:    i32.and $push6=, $1, $pop5
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop6
+; NO-SIMD128-NEXT:    i32.const $push9=, 12
+; NO-SIMD128-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT:    i32.const $push11=, -1
+; NO-SIMD128-NEXT:    i32.xor $push7=, $8, $pop11
+; NO-SIMD128-NEXT:    i32.and $push8=, $4, $pop7
+; NO-SIMD128-NEXT:    i32.store 0($pop10), $pop8
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: andnot_v4i32:
+; NO-SIMD128-FAST:         .functype andnot_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push1=, $5, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $6, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $2, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push12=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $7, $pop12
+; NO-SIMD128-FAST-NEXT:    i32.and $push6=, $3, $pop5
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push7=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push8=, $0, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.const $push11=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $8, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $4, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop8), $pop10
+; NO-SIMD128-FAST-NEXT:    return
  %inv_y = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
  %a = and <4 x i32> %x, %inv_y
  ret <4 x i32> %a
 }
 
-; CHECK-LABEL: bitselect_v4i32:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype bitselect_v4i32 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
-; SIMD128-FAST-NEXT: v128.not
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: v128.or
-; SIMD128-FAST-NEXT: return
 define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
+; SIMD128-LABEL: bitselect_v4i32:
+; SIMD128:         .functype bitselect_v4i32 (v128, v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.bitselect $push0=, $1, $2, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: bitselect_v4i32:
+; SIMD128-FAST:         .functype bitselect_v4i32 (v128, v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.not $push2=, $0
+; SIMD128-FAST-NEXT:    v128.and $push3=, $pop2, $2
+; SIMD128-FAST-NEXT:    v128.and $push0=, $0, $1
+; SIMD128-FAST-NEXT:    v128.or $push1=, $pop3, $pop0
+; SIMD128-FAST-NEXT:    return $pop1
+;
+; NO-SIMD128-LABEL: bitselect_v4i32:
+; NO-SIMD128:         .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push5=, 12
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-NEXT:    i32.xor $push2=, $4, $pop1
+; NO-SIMD128-NEXT:    i32.and $push3=, $pop2, $12
+; NO-SIMD128-NEXT:    i32.and $push0=, $4, $8
+; NO-SIMD128-NEXT:    i32.or $push4=, $pop3, $pop0
+; NO-SIMD128-NEXT:    i32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT:    i32.const $push21=, -1
+; NO-SIMD128-NEXT:    i32.xor $push8=, $3, $pop21
+; NO-SIMD128-NEXT:    i32.and $push9=, $pop8, $11
+; NO-SIMD128-NEXT:    i32.and $push7=, $3, $7
+; NO-SIMD128-NEXT:    i32.or $push10=, $pop9, $pop7
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop10
+; NO-SIMD128-NEXT:    i32.const $push20=, -1
+; NO-SIMD128-NEXT:    i32.xor $push12=, $2, $pop20
+; NO-SIMD128-NEXT:    i32.and $push13=, $pop12, $10
+; NO-SIMD128-NEXT:    i32.and $push11=, $2, $6
+; NO-SIMD128-NEXT:    i32.or $push14=, $pop13, $pop11
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop14
+; NO-SIMD128-NEXT:    i32.const $push19=, -1
+; NO-SIMD128-NEXT:    i32.xor $push16=, $1, $pop19
+; NO-SIMD128-NEXT:    i32.and $push17=, $pop16, $9
+; NO-SIMD128-NEXT:    i32.and $push15=, $1, $5
+; NO-SIMD128-NEXT:    i32.or $push18=, $pop17, $pop15
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop18
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: bitselect_v4i32:
+; NO-SIMD128-FAST:         .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $pop2, $9
+; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.or $push4=, $pop3, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $2, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $10
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $2, $6
+; NO-SIMD128-FAST-NEXT:    i32.or $push8=, $pop7, $pop5
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push20=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $3, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $pop10, $11
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $3, $7
+; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $pop11, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $4, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $pop14, $12
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $4, $8
+; NO-SIMD128-FAST-NEXT:    i32.or $push16=, $pop15, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop18), $pop16
+; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <4 x i32> %c, %v1
   %inv_mask = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %c
   %masked_v2 = and <4 x i32> %inv_mask, %v2
@@ -1064,31 +11226,149 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: bitselect_xor_v4i32:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype bitselect_xor_v4i32 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
-; SIMD128-FAST-NEXT: v128.xor
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: v128.xor
 define <4 x i32> @bitselect_xor_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
+; SIMD128-LABEL: bitselect_xor_v4i32:
+; SIMD128:         .functype bitselect_xor_v4i32 (v128, v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.bitselect $push0=, $1, $2, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: bitselect_xor_v4i32:
+; SIMD128-FAST:         .functype bitselect_xor_v4i32 (v128, v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.xor $push2=, $1, $2
+; SIMD128-FAST-NEXT:    v128.and $push1=, $pop2, $0
+; SIMD128-FAST-NEXT:    v128.xor $push0=, $pop1, $2
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: bitselect_xor_v4i32:
+; NO-SIMD128:         .functype bitselect_xor_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-NEXT:    i32.xor $push0=, $8, $12
+; NO-SIMD128-NEXT:    i32.and $push1=, $pop0, $4
+; NO-SIMD128-NEXT:    i32.xor $push2=, $pop1, $12
+; NO-SIMD128-NEXT:    i32.store 0($pop4), $pop2
+; NO-SIMD128-NEXT:    i32.xor $push5=, $7, $11
+; NO-SIMD128-NEXT:    i32.and $push6=, $pop5, $3
+; NO-SIMD128-NEXT:    i32.xor $push7=, $pop6, $11
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop7
+; NO-SIMD128-NEXT:    i32.xor $push8=, $6, $10
+; NO-SIMD128-NEXT:    i32.and $push9=, $pop8, $2
+; NO-SIMD128-NEXT:    i32.xor $push10=, $pop9, $10
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop10
+; NO-SIMD128-NEXT:    i32.xor $push11=, $5, $9
+; NO-SIMD128-NEXT:    i32.and $push12=, $pop11, $1
+; NO-SIMD128-NEXT:    i32.xor $push13=, $pop12, $9
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop13
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: bitselect_xor_v4i32:
+; NO-SIMD128-FAST:         .functype bitselect_xor_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $5, $9
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $pop1, $9
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $6, $10
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop3, $2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $pop4, $10
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $7, $11
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $3
+; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $11
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push9=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $8, $12
+; NO-SIMD128-FAST-NEXT:    i32.and $push12=, $pop11, $4
+; NO-SIMD128-FAST-NEXT:    i32.xor $push13=, $pop12, $12
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop10), $pop13
+; NO-SIMD128-FAST-NEXT:    return
  %xor1 = xor <4 x i32> %v1, %v2
  %and = and <4 x i32> %xor1, %c
  %a = xor <4 x i32> %and, %v2
  ret <4 x i32> %a
 }
 
-; CHECK-LABEL: bitselect_xor_reversed_v4i32:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype bitselect_xor_reversed_v4i32 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $2, $1, $0{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
-; SIMD128-FAST-NEXT: v128.xor
-; SIMD128-FAST-NEXT: v128.not
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: v128.xor
 define <4 x i32> @bitselect_xor_reversed_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
+; SIMD128-LABEL: bitselect_xor_reversed_v4i32:
+; SIMD128:         .functype bitselect_xor_reversed_v4i32 (v128, v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.bitselect $push0=, $2, $1, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: bitselect_xor_reversed_v4i32:
+; SIMD128-FAST:         .functype bitselect_xor_reversed_v4i32 (v128, v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.xor $push1=, $1, $2
+; SIMD128-FAST-NEXT:    v128.not $push2=, $0
+; SIMD128-FAST-NEXT:    v128.and $push3=, $pop1, $pop2
+; SIMD128-FAST-NEXT:    v128.xor $push0=, $pop3, $2
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: bitselect_xor_reversed_v4i32:
+; NO-SIMD128:         .functype bitselect_xor_reversed_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push5=, 12
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    i32.xor $push2=, $8, $12
+; NO-SIMD128-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-NEXT:    i32.xor $push1=, $4, $pop0
+; NO-SIMD128-NEXT:    i32.and $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.xor $push4=, $pop3, $12
+; NO-SIMD128-NEXT:    i32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT:    i32.xor $push8=, $7, $11
+; NO-SIMD128-NEXT:    i32.const $push21=, -1
+; NO-SIMD128-NEXT:    i32.xor $push7=, $3, $pop21
+; NO-SIMD128-NEXT:    i32.and $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.xor $push10=, $pop9, $11
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop10
+; NO-SIMD128-NEXT:    i32.xor $push12=, $6, $10
+; NO-SIMD128-NEXT:    i32.const $push20=, -1
+; NO-SIMD128-NEXT:    i32.xor $push11=, $2, $pop20
+; NO-SIMD128-NEXT:    i32.and $push13=, $pop12, $pop11
+; NO-SIMD128-NEXT:    i32.xor $push14=, $pop13, $10
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop14
+; NO-SIMD128-NEXT:    i32.xor $push16=, $5, $9
+; NO-SIMD128-NEXT:    i32.const $push19=, -1
+; NO-SIMD128-NEXT:    i32.xor $push15=, $1, $pop19
+; NO-SIMD128-NEXT:    i32.and $push17=, $pop16, $pop15
+; NO-SIMD128-NEXT:    i32.xor $push18=, $pop17, $9
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop18
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: bitselect_xor_reversed_v4i32:
+; NO-SIMD128-FAST:         .functype bitselect_xor_reversed_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $5, $9
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push4=, $pop3, $9
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $6, $10
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $2, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $pop5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $10
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $7, $11
+; NO-SIMD128-FAST-NEXT:    i32.const $push20=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $3, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.xor $push12=, $pop11, $11
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push18=, $0, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $8, $12
+; NO-SIMD128-FAST-NEXT:    i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push13=, $4, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.xor $push16=, $pop15, $12
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop18), $pop16
+; NO-SIMD128-FAST-NEXT:    return
  %xor1 = xor <4 x i32> %v1, %v2
  %notc = xor <4 x i32> %c, <i32 -1, i32 -1, i32 -1, i32 -1>
  %and = and <4 x i32> %xor1, %notc
@@ -1096,12 +11376,66 @@ define <4 x i32> @bitselect_xor_reversed_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x
  ret <4 x i32> %a
 }
 
-; CHECK-LABEL: extmul_low_s_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype extmul_low_s_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: i32x4.extmul_low_i16x8_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @extmul_low_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
+; SIMD128-LABEL: extmul_low_s_v4i32:
+; SIMD128:         .functype extmul_low_s_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.extmul_low_i16x8_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: extmul_low_s_v4i32:
+; SIMD128-FAST:         .functype extmul_low_s_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.extend_low_i16x8_s $push0=, $0
+; SIMD128-FAST-NEXT:    i32x4.extend_low_i16x8_s $push1=, $1
+; SIMD128-FAST-NEXT:    i32x4.mul $push2=, $pop0, $pop1
+; SIMD128-FAST-NEXT:    return $pop2
+;
+; NO-SIMD128-LABEL: extmul_low_s_v4i32:
+; NO-SIMD128:         .functype extmul_low_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.extend16_s $push1=, $3
+; NO-SIMD128-NEXT:    i32.extend16_s $push0=, $11
+; NO-SIMD128-NEXT:    i32.mul $push2=, $pop1, $pop0
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop2
+; NO-SIMD128-NEXT:    i32.extend16_s $push4=, $2
+; NO-SIMD128-NEXT:    i32.extend16_s $push3=, $10
+; NO-SIMD128-NEXT:    i32.mul $push5=, $pop4, $pop3
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop5
+; NO-SIMD128-NEXT:    i32.extend16_s $push7=, $1
+; NO-SIMD128-NEXT:    i32.extend16_s $push6=, $9
+; NO-SIMD128-NEXT:    i32.mul $push8=, $pop7, $pop6
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop8
+; NO-SIMD128-NEXT:    i32.const $push12=, 12
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.extend16_s $push10=, $4
+; NO-SIMD128-NEXT:    i32.extend16_s $push9=, $12
+; NO-SIMD128-NEXT:    i32.mul $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT:    i32.store 0($pop13), $pop11
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: extmul_low_s_v4i32:
+; NO-SIMD128-FAST:         .functype extmul_low_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push1=, $1
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push0=, $9
+; NO-SIMD128-FAST-NEXT:    i32.mul $push2=, $pop1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push4=, $2
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push3=, $10
+; NO-SIMD128-FAST-NEXT:    i32.mul $push5=, $pop4, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push7=, $3
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push6=, $11
+; NO-SIMD128-FAST-NEXT:    i32.mul $push8=, $pop7, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push9=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push12=, $4
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push11=, $12
+; NO-SIMD128-FAST-NEXT:    i32.mul $push13=, $pop12, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop10), $pop13
+; NO-SIMD128-FAST-NEXT:    return
   %low1 = shufflevector <8 x i16> %v1, <8 x i16> undef,
            <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %low2 = shufflevector <8 x i16> %v2, <8 x i16> undef,
@@ -1112,12 +11446,66 @@ define <4 x i32> @extmul_low_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: extmul_high_s_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype extmul_high_s_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: i32x4.extmul_high_i16x8_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @extmul_high_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
+; SIMD128-LABEL: extmul_high_s_v4i32:
+; SIMD128:         .functype extmul_high_s_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.extmul_high_i16x8_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: extmul_high_s_v4i32:
+; SIMD128-FAST:         .functype extmul_high_s_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.extend_high_i16x8_s $push0=, $0
+; SIMD128-FAST-NEXT:    i32x4.extend_high_i16x8_s $push1=, $1
+; SIMD128-FAST-NEXT:    i32x4.mul $push2=, $pop0, $pop1
+; SIMD128-FAST-NEXT:    return $pop2
+;
+; NO-SIMD128-LABEL: extmul_high_s_v4i32:
+; NO-SIMD128:         .functype extmul_high_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.extend16_s $push1=, $7
+; NO-SIMD128-NEXT:    i32.extend16_s $push0=, $15
+; NO-SIMD128-NEXT:    i32.mul $push2=, $pop1, $pop0
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop2
+; NO-SIMD128-NEXT:    i32.extend16_s $push4=, $6
+; NO-SIMD128-NEXT:    i32.extend16_s $push3=, $14
+; NO-SIMD128-NEXT:    i32.mul $push5=, $pop4, $pop3
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop5
+; NO-SIMD128-NEXT:    i32.extend16_s $push7=, $5
+; NO-SIMD128-NEXT:    i32.extend16_s $push6=, $13
+; NO-SIMD128-NEXT:    i32.mul $push8=, $pop7, $pop6
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop8
+; NO-SIMD128-NEXT:    i32.const $push12=, 12
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.extend16_s $push10=, $8
+; NO-SIMD128-NEXT:    i32.extend16_s $push9=, $16
+; NO-SIMD128-NEXT:    i32.mul $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT:    i32.store 0($pop13), $pop11
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: extmul_high_s_v4i32:
+; NO-SIMD128-FAST:         .functype extmul_high_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push1=, $5
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push0=, $13
+; NO-SIMD128-FAST-NEXT:    i32.mul $push2=, $pop1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push4=, $6
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push3=, $14
+; NO-SIMD128-FAST-NEXT:    i32.mul $push5=, $pop4, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push7=, $7
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push6=, $15
+; NO-SIMD128-FAST-NEXT:    i32.mul $push8=, $pop7, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push9=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push12=, $8
+; NO-SIMD128-FAST-NEXT:    i32.extend16_s $push11=, $16
+; NO-SIMD128-FAST-NEXT:    i32.mul $push13=, $pop12, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop10), $pop13
+; NO-SIMD128-FAST-NEXT:    return
   %high1 = shufflevector <8 x i16> %v1, <8 x i16> undef,
            <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %high2 = shufflevector <8 x i16> %v2, <8 x i16> undef,
@@ -1128,12 +11516,82 @@ define <4 x i32> @extmul_high_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: extmul_low_u_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype extmul_low_u_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: i32x4.extmul_low_i16x8_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @extmul_low_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
+; SIMD128-LABEL: extmul_low_u_v4i32:
+; SIMD128:         .functype extmul_low_u_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.extmul_low_i16x8_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: extmul_low_u_v4i32:
+; SIMD128-FAST:         .functype extmul_low_u_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.extend_low_i16x8_u $push0=, $0
+; SIMD128-FAST-NEXT:    i32x4.extend_low_i16x8_u $push1=, $1
+; SIMD128-FAST-NEXT:    i32x4.mul $push2=, $pop0, $pop1
+; SIMD128-FAST-NEXT:    return $pop2
+;
+; NO-SIMD128-LABEL: extmul_low_u_v4i32:
+; NO-SIMD128:         .functype extmul_low_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-NEXT:    i32.and $push2=, $3, $pop0
+; NO-SIMD128-NEXT:    i32.const $push21=, 65535
+; NO-SIMD128-NEXT:    i32.and $push1=, $11, $pop21
+; NO-SIMD128-NEXT:    i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push20=, 65535
+; NO-SIMD128-NEXT:    i32.and $push5=, $2, $pop20
+; NO-SIMD128-NEXT:    i32.const $push19=, 65535
+; NO-SIMD128-NEXT:    i32.and $push4=, $10, $pop19
+; NO-SIMD128-NEXT:    i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop6
+; NO-SIMD128-NEXT:    i32.const $push18=, 65535
+; NO-SIMD128-NEXT:    i32.and $push8=, $1, $pop18
+; NO-SIMD128-NEXT:    i32.const $push17=, 65535
+; NO-SIMD128-NEXT:    i32.and $push7=, $9, $pop17
+; NO-SIMD128-NEXT:    i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop9
+; NO-SIMD128-NEXT:    i32.const $push13=, 12
+; NO-SIMD128-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT:    i32.const $push16=, 65535
+; NO-SIMD128-NEXT:    i32.and $push11=, $4, $pop16
+; NO-SIMD128-NEXT:    i32.const $push15=, 65535
+; NO-SIMD128-NEXT:    i32.and $push10=, $12, $pop15
+; NO-SIMD128-NEXT:    i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT:    i32.store 0($pop14), $pop12
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: extmul_low_u_v4i32:
+; NO-SIMD128-FAST:         .functype extmul_low_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $9, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push20=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $2, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push19=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $10, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push18=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push8=, $3, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $11, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push16=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $4, $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push15=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $12, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop14), $pop12
+; NO-SIMD128-FAST-NEXT:    return
   %low1 = shufflevector <8 x i16> %v1, <8 x i16> undef,
            <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %low2 = shufflevector <8 x i16> %v2, <8 x i16> undef,
@@ -1144,12 +11602,82 @@ define <4 x i32> @extmul_low_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: extmul_high_u_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype extmul_high_u_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: i32x4.extmul_high_i16x8_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @extmul_high_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
+; SIMD128-LABEL: extmul_high_u_v4i32:
+; SIMD128:         .functype extmul_high_u_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.extmul_high_i16x8_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: extmul_high_u_v4i32:
+; SIMD128-FAST:         .functype extmul_high_u_v4i32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32x4.extend_high_i16x8_u $push0=, $0
+; SIMD128-FAST-NEXT:    i32x4.extend_high_i16x8_u $push1=, $1
+; SIMD128-FAST-NEXT:    i32x4.mul $push2=, $pop0, $pop1
+; SIMD128-FAST-NEXT:    return $pop2
+;
+; NO-SIMD128-LABEL: extmul_high_u_v4i32:
+; NO-SIMD128:         .functype extmul_high_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-NEXT:    i32.and $push2=, $7, $pop0
+; NO-SIMD128-NEXT:    i32.const $push21=, 65535
+; NO-SIMD128-NEXT:    i32.and $push1=, $15, $pop21
+; NO-SIMD128-NEXT:    i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push20=, 65535
+; NO-SIMD128-NEXT:    i32.and $push5=, $6, $pop20
+; NO-SIMD128-NEXT:    i32.const $push19=, 65535
+; NO-SIMD128-NEXT:    i32.and $push4=, $14, $pop19
+; NO-SIMD128-NEXT:    i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop6
+; NO-SIMD128-NEXT:    i32.const $push18=, 65535
+; NO-SIMD128-NEXT:    i32.and $push8=, $5, $pop18
+; NO-SIMD128-NEXT:    i32.const $push17=, 65535
+; NO-SIMD128-NEXT:    i32.and $push7=, $13, $pop17
+; NO-SIMD128-NEXT:    i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop9
+; NO-SIMD128-NEXT:    i32.const $push13=, 12
+; NO-SIMD128-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT:    i32.const $push16=, 65535
+; NO-SIMD128-NEXT:    i32.and $push11=, $8, $pop16
+; NO-SIMD128-NEXT:    i32.const $push15=, 65535
+; NO-SIMD128-NEXT:    i32.and $push10=, $16, $pop15
+; NO-SIMD128-NEXT:    i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT:    i32.store 0($pop14), $pop12
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: extmul_high_u_v4i32:
+; NO-SIMD128-FAST:         .functype extmul_high_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push2=, $5, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.const $push21=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $13, $pop21
+; NO-SIMD128-FAST-NEXT:    i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push20=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $6, $pop20
+; NO-SIMD128-FAST-NEXT:    i32.const $push19=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $14, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop6
+; NO-SIMD128-FAST-NEXT:    i32.const $push18=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push8=, $7, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $15, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop9
+; NO-SIMD128-FAST-NEXT:    i32.const $push13=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.const $push16=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $8, $pop16
+; NO-SIMD128-FAST-NEXT:    i32.const $push15=, 65535
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $16, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop14), $pop12
+; NO-SIMD128-FAST-NEXT:    return
   %high1 = shufflevector <8 x i16> %v1, <8 x i16> undef,
            <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %high2 = shufflevector <8 x i16> %v2, <8 x i16> undef,
@@ -1163,64 +11691,232 @@ define <4 x i32> @extmul_high_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
 ; ==============================================================================
 ; 2 x i64
 ; ==============================================================================
-; CHECK-LABEL: add_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype add_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.add $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @add_v2i64(<2 x i64> %x, <2 x i64> %y) {
+; SIMD128-LABEL: add_v2i64:
+; SIMD128:         .functype add_v2i64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.add $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: add_v2i64:
+; SIMD128-FAST:         .functype add_v2i64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.add $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: add_v2i64:
+; NO-SIMD128:         .functype add_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.add $push0=, $2, $4
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.add $push1=, $1, $3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: add_v2i64:
+; NO-SIMD128-FAST:         .functype add_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.add $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.add $push1=, $2, $4
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = add <2 x i64> %x, %y
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: sub_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype sub_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.sub $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @sub_v2i64(<2 x i64> %x, <2 x i64> %y) {
+; SIMD128-LABEL: sub_v2i64:
+; SIMD128:         .functype sub_v2i64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.sub $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: sub_v2i64:
+; SIMD128-FAST:         .functype sub_v2i64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.sub $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: sub_v2i64:
+; NO-SIMD128:         .functype sub_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.sub $push0=, $2, $4
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.sub $push1=, $1, $3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: sub_v2i64:
+; NO-SIMD128-FAST:         .functype sub_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.sub $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.sub $push1=, $2, $4
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = sub <2 x i64> %x, %y
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: mul_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype mul_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128: i64x2.mul $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @mul_v2i64(<2 x i64> %x, <2 x i64> %y) {
+; SIMD128-LABEL: mul_v2i64:
+; SIMD128:         .functype mul_v2i64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.mul $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: mul_v2i64:
+; SIMD128-FAST:         .functype mul_v2i64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.mul $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: mul_v2i64:
+; NO-SIMD128:         .functype mul_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.mul $push0=, $2, $4
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.mul $push1=, $1, $3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: mul_v2i64:
+; NO-SIMD128-FAST:         .functype mul_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.mul $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.mul $push1=, $2, $4
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = mul <2 x i64> %x, %y
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: abs_v2i64:
-; NO-SIMD128-NOT: i64x2:
-; SIMD128-NEXT: .functype abs_v2i64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.abs $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @abs_v2i64(<2 x i64> %x) {
+; SIMD128-LABEL: abs_v2i64:
+; SIMD128:         .functype abs_v2i64 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.abs $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: abs_v2i64:
+; SIMD128-FAST:         .functype abs_v2i64 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.abs $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: abs_v2i64:
+; NO-SIMD128:         .functype abs_v2i64 (i32, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 63
+; NO-SIMD128-NEXT:    i64.shr_s $push9=, $2, $pop0
+; NO-SIMD128-NEXT:    local.tee $push8=, $3=, $pop9
+; NO-SIMD128-NEXT:    i64.xor $push1=, $2, $pop8
+; NO-SIMD128-NEXT:    i64.sub $push2=, $pop1, $3
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop2
+; NO-SIMD128-NEXT:    i64.const $push7=, 63
+; NO-SIMD128-NEXT:    i64.shr_s $push6=, $1, $pop7
+; NO-SIMD128-NEXT:    local.tee $push5=, $2=, $pop6
+; NO-SIMD128-NEXT:    i64.xor $push3=, $1, $pop5
+; NO-SIMD128-NEXT:    i64.sub $push4=, $pop3, $2
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop4
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: abs_v2i64:
+; NO-SIMD128-FAST:         .functype abs_v2i64 (i32, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.const $push0=, 63
+; NO-SIMD128-FAST-NEXT:    i64.shr_s $push9=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    local.tee $push8=, $3=, $pop9
+; NO-SIMD128-FAST-NEXT:    i64.xor $push1=, $1, $pop8
+; NO-SIMD128-FAST-NEXT:    i64.sub $push2=, $pop1, $3
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i64.const $push7=, 63
+; NO-SIMD128-FAST-NEXT:    i64.shr_s $push6=, $2, $pop7
+; NO-SIMD128-FAST-NEXT:    local.tee $push5=, $1=, $pop6
+; NO-SIMD128-FAST-NEXT:    i64.xor $push3=, $2, $pop5
+; NO-SIMD128-FAST-NEXT:    i64.sub $push4=, $pop3, $1
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop4
+; NO-SIMD128-FAST-NEXT:    return
   %a = sub <2 x i64> zeroinitializer, %x
   %b = icmp slt <2 x i64> %x, zeroinitializer
   %c = select <2 x i1> %b, <2 x i64> %a, <2 x i64> %x
   ret <2 x i64> %c
 }
 
-; CHECK-LABEL: neg_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype neg_v2i64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.neg $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @neg_v2i64(<2 x i64> %x) {
+; SIMD128-LABEL: neg_v2i64:
+; SIMD128:         .functype neg_v2i64 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.neg $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: neg_v2i64:
+; SIMD128-FAST:         .functype neg_v2i64 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.neg $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: neg_v2i64:
+; NO-SIMD128:         .functype neg_v2i64 (i32, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 0
+; NO-SIMD128-NEXT:    i64.sub $push1=, $pop0, $2
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-NEXT:    i64.const $push3=, 0
+; NO-SIMD128-NEXT:    i64.sub $push2=, $pop3, $1
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop2
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: neg_v2i64:
+; NO-SIMD128-FAST:         .functype neg_v2i64 (i32, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.const $push0=, 0
+; NO-SIMD128-FAST-NEXT:    i64.sub $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i64.const $push3=, 0
+; NO-SIMD128-FAST-NEXT:    i64.sub $push2=, $pop3, $2
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    return
   %a = sub <2 x i64> <i64 0, i64 0>, %x
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: shl_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype shl_v2i64 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.shl $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shl_v2i64(<2 x i64> %v, i32 %x) {
+; SIMD128-LABEL: shl_v2i64:
+; SIMD128:         .functype shl_v2i64 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.shl $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: shl_v2i64:
+; SIMD128-FAST:         .functype shl_v2i64 (v128, i32) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.shl $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shl_v2i64:
+; NO-SIMD128:         .functype shl_v2i64 (i32, i64, i64, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.extend_i32_u $push3=, $3
+; NO-SIMD128-NEXT:    local.tee $push2=, $4=, $pop3
+; NO-SIMD128-NEXT:    i64.shl $push0=, $2, $pop2
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.shl $push1=, $1, $4
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shl_v2i64:
+; NO-SIMD128-FAST:         .functype shl_v2i64 (i32, i64, i64, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_u $push3=, $3
+; NO-SIMD128-FAST-NEXT:    local.tee $push2=, $4=, $pop3
+; NO-SIMD128-FAST-NEXT:    i64.shl $push0=, $2, $pop2
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.shl $push1=, $1, $4
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %x2 = zext i32 %x to i64
   %t = insertelement <2 x i64> undef, i64 %x2, i32 0
   %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
@@ -1228,12 +11924,40 @@ define <2 x i64> @shl_v2i64(<2 x i64> %v, i32 %x) {
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: shl_sext_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype shl_sext_v2i64 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.shl $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shl_sext_v2i64(<2 x i64> %v, i32 %x) {
+; SIMD128-LABEL: shl_sext_v2i64:
+; SIMD128:         .functype shl_sext_v2i64 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.shl $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: shl_sext_v2i64:
+; SIMD128-FAST:         .functype shl_sext_v2i64 (v128, i32) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.shl $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shl_sext_v2i64:
+; NO-SIMD128:         .functype shl_sext_v2i64 (i32, i64, i64, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.extend_i32_s $push3=, $3
+; NO-SIMD128-NEXT:    local.tee $push2=, $4=, $pop3
+; NO-SIMD128-NEXT:    i64.shl $push0=, $2, $pop2
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.shl $push1=, $1, $4
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shl_sext_v2i64:
+; NO-SIMD128-FAST:         .functype shl_sext_v2i64 (i32, i64, i64, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_s $push3=, $3
+; NO-SIMD128-FAST-NEXT:    local.tee $push2=, $4=, $pop3
+; NO-SIMD128-FAST-NEXT:    i64.shl $push0=, $2, $pop2
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.shl $push1=, $1, $4
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %x2 = sext i32 %x to i64
   %t = insertelement <2 x i64> undef, i64 %x2, i32 0
   %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
@@ -1241,53 +11965,166 @@ define <2 x i64> @shl_sext_v2i64(<2 x i64> %v, i32 %x) {
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: shl_noext_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype shl_noext_v2i64 (v128, i64) -> (v128){{$}}
-; SIMD128-NEXT: i32.wrap_i64 $push[[L0:[0-9]+]]=, $1{{$}}
-; SIMD128-NEXT: i64x2.shl $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shl_noext_v2i64(<2 x i64> %v, i64 %x) {
+; SIMD128-LABEL: shl_noext_v2i64:
+; SIMD128:         .functype shl_noext_v2i64 (v128, i64) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32.wrap_i64 $push0=, $1
+; SIMD128-NEXT:    i64x2.shl $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: shl_noext_v2i64:
+; SIMD128-FAST:         .functype shl_noext_v2i64 (v128, i64) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32.wrap_i64 $push1=, $1
+; SIMD128-FAST-NEXT:    i64x2.shl $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shl_noext_v2i64:
+; NO-SIMD128:         .functype shl_noext_v2i64 (i32, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.shl $push0=, $2, $3
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.shl $push1=, $1, $3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shl_noext_v2i64:
+; NO-SIMD128-FAST:         .functype shl_noext_v2i64 (i32, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.shl $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.shl $push1=, $2, $3
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %t = insertelement <2 x i64> undef, i64 %x, i32 0
   %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
   %a = shl <2 x i64> %v, %s
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: shl_const_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype shl_const_v2i64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5{{$}}
-; SIMD128-NEXT: i64x2.shl $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shl_const_v2i64(<2 x i64> %v) {
+; SIMD128-LABEL: shl_const_v2i64:
+; SIMD128:         .functype shl_const_v2i64 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32.const $push0=, 5
+; SIMD128-NEXT:    i64x2.shl $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: shl_const_v2i64:
+; SIMD128-FAST:         .functype shl_const_v2i64 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32.const $push1=, 5
+; SIMD128-FAST-NEXT:    i64x2.shl $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shl_const_v2i64:
+; NO-SIMD128:         .functype shl_const_v2i64 (i32, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 5
+; NO-SIMD128-NEXT:    i64.shl $push1=, $2, $pop0
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-NEXT:    i64.const $push3=, 5
+; NO-SIMD128-NEXT:    i64.shl $push2=, $1, $pop3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop2
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shl_const_v2i64:
+; NO-SIMD128-FAST:         .functype shl_const_v2i64 (i32, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.const $push0=, 5
+; NO-SIMD128-FAST-NEXT:    i64.shl $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i64.const $push3=, 5
+; NO-SIMD128-FAST-NEXT:    i64.shl $push2=, $2, $pop3
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    return
   %a = shl <2 x i64> %v, <i64 5, i64 5>
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: shl_vec_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype shl_vec_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: i64x2.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
-; SIMD128-NEXT: i64.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: i64x2.extract_lane $push[[L4:[0-9]+]]=, $0, 1{{$}}
-; SIMD128-NEXT: i64x2.extract_lane $push[[L5:[0-9]+]]=, $1, 1{{$}}
-; SIMD128-NEXT: i64.shl $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
-; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L3]], 1, $pop[[L6]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shl_vec_v2i64(<2 x i64> %v, <2 x i64> %x) {
+; SIMD128-LABEL: shl_vec_v2i64:
+; SIMD128:         .functype shl_vec_v2i64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.extract_lane $push4=, $0, 0
+; SIMD128-NEXT:    i64x2.extract_lane $push3=, $1, 0
+; SIMD128-NEXT:    i64.shl $push5=, $pop4, $pop3
+; SIMD128-NEXT:    i64x2.splat $push6=, $pop5
+; SIMD128-NEXT:    i64x2.extract_lane $push1=, $0, 1
+; SIMD128-NEXT:    i64x2.extract_lane $push0=, $1, 1
+; SIMD128-NEXT:    i64.shl $push2=, $pop1, $pop0
+; SIMD128-NEXT:    i64x2.replace_lane $push7=, $pop6, 1, $pop2
+; SIMD128-NEXT:    return $pop7
+;
+; SIMD128-FAST-LABEL: shl_vec_v2i64:
+; SIMD128-FAST:         .functype shl_vec_v2i64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.extract_lane $push5=, $0, 0
+; SIMD128-FAST-NEXT:    i64x2.extract_lane $push4=, $1, 0
+; SIMD128-FAST-NEXT:    i64.shl $push6=, $pop5, $pop4
+; SIMD128-FAST-NEXT:    i64x2.splat $push7=, $pop6
+; SIMD128-FAST-NEXT:    i64x2.extract_lane $push2=, $0, 1
+; SIMD128-FAST-NEXT:    i64x2.extract_lane $push1=, $1, 1
+; SIMD128-FAST-NEXT:    i64.shl $push3=, $pop2, $pop1
+; SIMD128-FAST-NEXT:    i64x2.replace_lane $push0=, $pop7, 1, $pop3
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shl_vec_v2i64:
+; NO-SIMD128:         .functype shl_vec_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.shl $push0=, $2, $4
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.shl $push1=, $1, $3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shl_vec_v2i64:
+; NO-SIMD128-FAST:         .functype shl_vec_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.shl $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.shl $push1=, $2, $4
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = shl <2 x i64> %v, %x
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: shr_s_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype shr_s_v2i64 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.shr_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shr_s_v2i64(<2 x i64> %v, i32 %x) {
+; SIMD128-LABEL: shr_s_v2i64:
+; SIMD128:         .functype shr_s_v2i64 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.shr_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: shr_s_v2i64:
+; SIMD128-FAST:         .functype shr_s_v2i64 (v128, i32) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.shr_s $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_s_v2i64:
+; NO-SIMD128:         .functype shr_s_v2i64 (i32, i64, i64, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.extend_i32_u $push3=, $3
+; NO-SIMD128-NEXT:    local.tee $push2=, $4=, $pop3
+; NO-SIMD128-NEXT:    i64.shr_s $push0=, $2, $pop2
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.shr_s $push1=, $1, $4
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_s_v2i64:
+; NO-SIMD128-FAST:         .functype shr_s_v2i64 (i32, i64, i64, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_u $push3=, $3
+; NO-SIMD128-FAST-NEXT:    local.tee $push2=, $4=, $pop3
+; NO-SIMD128-FAST-NEXT:    i64.shr_s $push0=, $2, $pop2
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.shr_s $push1=, $1, $4
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %x2 = zext i32 %x to i64
   %t = insertelement <2 x i64> undef, i64 %x2, i32 0
   %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
@@ -1295,12 +12132,40 @@ define <2 x i64> @shr_s_v2i64(<2 x i64> %v, i32 %x) {
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: shr_s_sext_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype shr_s_sext_v2i64 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.shr_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shr_s_sext_v2i64(<2 x i64> %v, i32 %x) {
+; SIMD128-LABEL: shr_s_sext_v2i64:
+; SIMD128:         .functype shr_s_sext_v2i64 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.shr_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: shr_s_sext_v2i64:
+; SIMD128-FAST:         .functype shr_s_sext_v2i64 (v128, i32) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.shr_s $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_s_sext_v2i64:
+; NO-SIMD128:         .functype shr_s_sext_v2i64 (i32, i64, i64, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.extend_i32_s $push3=, $3
+; NO-SIMD128-NEXT:    local.tee $push2=, $4=, $pop3
+; NO-SIMD128-NEXT:    i64.shr_s $push0=, $2, $pop2
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.shr_s $push1=, $1, $4
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_s_sext_v2i64:
+; NO-SIMD128-FAST:         .functype shr_s_sext_v2i64 (i32, i64, i64, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_s $push3=, $3
+; NO-SIMD128-FAST-NEXT:    local.tee $push2=, $4=, $pop3
+; NO-SIMD128-FAST-NEXT:    i64.shr_s $push0=, $2, $pop2
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.shr_s $push1=, $1, $4
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %x2 = sext i32 %x to i64
   %t = insertelement <2 x i64> undef, i64 %x2, i32 0
   %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
@@ -1308,53 +12173,166 @@ define <2 x i64> @shr_s_sext_v2i64(<2 x i64> %v, i32 %x) {
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: shr_s_noext_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype shr_s_noext_v2i64 (v128, i64) -> (v128){{$}}
-; SIMD128-NEXT: i32.wrap_i64 $push[[L0:[0-9]+]]=, $1{{$}}
-; SIMD128-NEXT: i64x2.shr_s $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shr_s_noext_v2i64(<2 x i64> %v, i64 %x) {
+; SIMD128-LABEL: shr_s_noext_v2i64:
+; SIMD128:         .functype shr_s_noext_v2i64 (v128, i64) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32.wrap_i64 $push0=, $1
+; SIMD128-NEXT:    i64x2.shr_s $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: shr_s_noext_v2i64:
+; SIMD128-FAST:         .functype shr_s_noext_v2i64 (v128, i64) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32.wrap_i64 $push1=, $1
+; SIMD128-FAST-NEXT:    i64x2.shr_s $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_s_noext_v2i64:
+; NO-SIMD128:         .functype shr_s_noext_v2i64 (i32, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.shr_s $push0=, $2, $3
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.shr_s $push1=, $1, $3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_s_noext_v2i64:
+; NO-SIMD128-FAST:         .functype shr_s_noext_v2i64 (i32, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.shr_s $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.shr_s $push1=, $2, $3
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %t = insertelement <2 x i64> undef, i64 %x, i32 0
   %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
   %a = ashr <2 x i64> %v, %s
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: shr_s_const_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype shr_s_const_v2i64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5{{$}}
-; SIMD128-NEXT: i64x2.shr_s $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shr_s_const_v2i64(<2 x i64> %v) {
+; SIMD128-LABEL: shr_s_const_v2i64:
+; SIMD128:         .functype shr_s_const_v2i64 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32.const $push0=, 5
+; SIMD128-NEXT:    i64x2.shr_s $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: shr_s_const_v2i64:
+; SIMD128-FAST:         .functype shr_s_const_v2i64 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32.const $push1=, 5
+; SIMD128-FAST-NEXT:    i64x2.shr_s $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_s_const_v2i64:
+; NO-SIMD128:         .functype shr_s_const_v2i64 (i32, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 5
+; NO-SIMD128-NEXT:    i64.shr_s $push1=, $2, $pop0
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-NEXT:    i64.const $push3=, 5
+; NO-SIMD128-NEXT:    i64.shr_s $push2=, $1, $pop3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop2
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_s_const_v2i64:
+; NO-SIMD128-FAST:         .functype shr_s_const_v2i64 (i32, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.const $push0=, 5
+; NO-SIMD128-FAST-NEXT:    i64.shr_s $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i64.const $push3=, 5
+; NO-SIMD128-FAST-NEXT:    i64.shr_s $push2=, $2, $pop3
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    return
   %a = ashr <2 x i64> %v, <i64 5, i64 5>
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: shr_s_vec_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype shr_s_vec_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: i64x2.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
-; SIMD128-NEXT: i64.shr_s $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: i64x2.extract_lane $push[[L4:[0-9]+]]=, $0, 1{{$}}
-; SIMD128-NEXT: i64x2.extract_lane $push[[L5:[0-9]+]]=, $1, 1{{$}}
-; SIMD128-NEXT: i64.shr_s $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
-; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L3]], 1, $pop[[L6]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shr_s_vec_v2i64(<2 x i64> %v, <2 x i64> %x) {
+; SIMD128-LABEL: shr_s_vec_v2i64:
+; SIMD128:         .functype shr_s_vec_v2i64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.extract_lane $push4=, $0, 0
+; SIMD128-NEXT:    i64x2.extract_lane $push3=, $1, 0
+; SIMD128-NEXT:    i64.shr_s $push5=, $pop4, $pop3
+; SIMD128-NEXT:    i64x2.splat $push6=, $pop5
+; SIMD128-NEXT:    i64x2.extract_lane $push1=, $0, 1
+; SIMD128-NEXT:    i64x2.extract_lane $push0=, $1, 1
+; SIMD128-NEXT:    i64.shr_s $push2=, $pop1, $pop0
+; SIMD128-NEXT:    i64x2.replace_lane $push7=, $pop6, 1, $pop2
+; SIMD128-NEXT:    return $pop7
+;
+; SIMD128-FAST-LABEL: shr_s_vec_v2i64:
+; SIMD128-FAST:         .functype shr_s_vec_v2i64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.extract_lane $push5=, $0, 0
+; SIMD128-FAST-NEXT:    i64x2.extract_lane $push4=, $1, 0
+; SIMD128-FAST-NEXT:    i64.shr_s $push6=, $pop5, $pop4
+; SIMD128-FAST-NEXT:    i64x2.splat $push7=, $pop6
+; SIMD128-FAST-NEXT:    i64x2.extract_lane $push2=, $0, 1
+; SIMD128-FAST-NEXT:    i64x2.extract_lane $push1=, $1, 1
+; SIMD128-FAST-NEXT:    i64.shr_s $push3=, $pop2, $pop1
+; SIMD128-FAST-NEXT:    i64x2.replace_lane $push0=, $pop7, 1, $pop3
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_s_vec_v2i64:
+; NO-SIMD128:         .functype shr_s_vec_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.shr_s $push0=, $2, $4
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.shr_s $push1=, $1, $3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_s_vec_v2i64:
+; NO-SIMD128-FAST:         .functype shr_s_vec_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.shr_s $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.shr_s $push1=, $2, $4
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = ashr <2 x i64> %v, %x
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: shr_u_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype shr_u_v2i64 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.shr_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shr_u_v2i64(<2 x i64> %v, i32 %x) {
+; SIMD128-LABEL: shr_u_v2i64:
+; SIMD128:         .functype shr_u_v2i64 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.shr_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: shr_u_v2i64:
+; SIMD128-FAST:         .functype shr_u_v2i64 (v128, i32) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.shr_u $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_u_v2i64:
+; NO-SIMD128:         .functype shr_u_v2i64 (i32, i64, i64, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.extend_i32_u $push3=, $3
+; NO-SIMD128-NEXT:    local.tee $push2=, $4=, $pop3
+; NO-SIMD128-NEXT:    i64.shr_u $push0=, $2, $pop2
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.shr_u $push1=, $1, $4
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_u_v2i64:
+; NO-SIMD128-FAST:         .functype shr_u_v2i64 (i32, i64, i64, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_u $push3=, $3
+; NO-SIMD128-FAST-NEXT:    local.tee $push2=, $4=, $pop3
+; NO-SIMD128-FAST-NEXT:    i64.shr_u $push0=, $2, $pop2
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.shr_u $push1=, $1, $4
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %x2 = zext i32 %x to i64
   %t = insertelement <2 x i64> undef, i64 %x2, i32 0
   %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
@@ -1362,12 +12340,40 @@ define <2 x i64> @shr_u_v2i64(<2 x i64> %v, i32 %x) {
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: shr_u_sext_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype shr_u_sext_v2i64 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.shr_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shr_u_sext_v2i64(<2 x i64> %v, i32 %x) {
+; SIMD128-LABEL: shr_u_sext_v2i64:
+; SIMD128:         .functype shr_u_sext_v2i64 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.shr_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: shr_u_sext_v2i64:
+; SIMD128-FAST:         .functype shr_u_sext_v2i64 (v128, i32) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.shr_u $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_u_sext_v2i64:
+; NO-SIMD128:         .functype shr_u_sext_v2i64 (i32, i64, i64, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.extend_i32_s $push3=, $3
+; NO-SIMD128-NEXT:    local.tee $push2=, $4=, $pop3
+; NO-SIMD128-NEXT:    i64.shr_u $push0=, $2, $pop2
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.shr_u $push1=, $1, $4
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_u_sext_v2i64:
+; NO-SIMD128-FAST:         .functype shr_u_sext_v2i64 (i32, i64, i64, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_s $push3=, $3
+; NO-SIMD128-FAST-NEXT:    local.tee $push2=, $4=, $pop3
+; NO-SIMD128-FAST-NEXT:    i64.shr_u $push0=, $2, $pop2
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.shr_u $push1=, $1, $4
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %x2 = sext i32 %x to i64
   %t = insertelement <2 x i64> undef, i64 %x2, i32 0
   %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
@@ -1375,112 +12381,365 @@ define <2 x i64> @shr_u_sext_v2i64(<2 x i64> %v, i32 %x) {
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: shr_u_noext_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype shr_u_noext_v2i64 (v128, i64) -> (v128){{$}}
-; SIMD128-NEXT: i32.wrap_i64 $push[[L0:[0-9]+]]=, $1{{$}}
-; SIMD128-NEXT: i64x2.shr_u $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shr_u_noext_v2i64(<2 x i64> %v, i64 %x) {
+; SIMD128-LABEL: shr_u_noext_v2i64:
+; SIMD128:         .functype shr_u_noext_v2i64 (v128, i64) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32.wrap_i64 $push0=, $1
+; SIMD128-NEXT:    i64x2.shr_u $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: shr_u_noext_v2i64:
+; SIMD128-FAST:         .functype shr_u_noext_v2i64 (v128, i64) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32.wrap_i64 $push1=, $1
+; SIMD128-FAST-NEXT:    i64x2.shr_u $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_u_noext_v2i64:
+; NO-SIMD128:         .functype shr_u_noext_v2i64 (i32, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.shr_u $push0=, $2, $3
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.shr_u $push1=, $1, $3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_u_noext_v2i64:
+; NO-SIMD128-FAST:         .functype shr_u_noext_v2i64 (i32, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.shr_u $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.shr_u $push1=, $2, $3
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %t = insertelement <2 x i64> undef, i64 %x, i32 0
   %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
   %a = lshr <2 x i64> %v, %s
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: shr_u_const_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype shr_u_const_v2i64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5{{$}}
-; SIMD128-NEXT: i64x2.shr_u $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shr_u_const_v2i64(<2 x i64> %v) {
+; SIMD128-LABEL: shr_u_const_v2i64:
+; SIMD128:         .functype shr_u_const_v2i64 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32.const $push0=, 5
+; SIMD128-NEXT:    i64x2.shr_u $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: shr_u_const_v2i64:
+; SIMD128-FAST:         .functype shr_u_const_v2i64 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i32.const $push1=, 5
+; SIMD128-FAST-NEXT:    i64x2.shr_u $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_u_const_v2i64:
+; NO-SIMD128:         .functype shr_u_const_v2i64 (i32, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 5
+; NO-SIMD128-NEXT:    i64.shr_u $push1=, $2, $pop0
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-NEXT:    i64.const $push3=, 5
+; NO-SIMD128-NEXT:    i64.shr_u $push2=, $1, $pop3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop2
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_u_const_v2i64:
+; NO-SIMD128-FAST:         .functype shr_u_const_v2i64 (i32, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.const $push0=, 5
+; NO-SIMD128-FAST-NEXT:    i64.shr_u $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i64.const $push3=, 5
+; NO-SIMD128-FAST-NEXT:    i64.shr_u $push2=, $2, $pop3
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    return
   %a = lshr <2 x i64> %v, <i64 5, i64 5>
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: shr_u_vec_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype shr_u_vec_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: i64x2.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
-; SIMD128-NEXT: i64.shr_u $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: i64x2.extract_lane $push[[L4:[0-9]+]]=, $0, 1{{$}}
-; SIMD128-NEXT: i64x2.extract_lane $push[[L5:[0-9]+]]=, $1, 1{{$}}
-; SIMD128-NEXT: i64.shr_u $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
-; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L3]], 1, $pop[[L6]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shr_u_vec_v2i64(<2 x i64> %v, <2 x i64> %x) {
+; SIMD128-LABEL: shr_u_vec_v2i64:
+; SIMD128:         .functype shr_u_vec_v2i64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.extract_lane $push4=, $0, 0
+; SIMD128-NEXT:    i64x2.extract_lane $push3=, $1, 0
+; SIMD128-NEXT:    i64.shr_u $push5=, $pop4, $pop3
+; SIMD128-NEXT:    i64x2.splat $push6=, $pop5
+; SIMD128-NEXT:    i64x2.extract_lane $push1=, $0, 1
+; SIMD128-NEXT:    i64x2.extract_lane $push0=, $1, 1
+; SIMD128-NEXT:    i64.shr_u $push2=, $pop1, $pop0
+; SIMD128-NEXT:    i64x2.replace_lane $push7=, $pop6, 1, $pop2
+; SIMD128-NEXT:    return $pop7
+;
+; SIMD128-FAST-LABEL: shr_u_vec_v2i64:
+; SIMD128-FAST:         .functype shr_u_vec_v2i64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.extract_lane $push5=, $0, 0
+; SIMD128-FAST-NEXT:    i64x2.extract_lane $push4=, $1, 0
+; SIMD128-FAST-NEXT:    i64.shr_u $push6=, $pop5, $pop4
+; SIMD128-FAST-NEXT:    i64x2.splat $push7=, $pop6
+; SIMD128-FAST-NEXT:    i64x2.extract_lane $push2=, $0, 1
+; SIMD128-FAST-NEXT:    i64x2.extract_lane $push1=, $1, 1
+; SIMD128-FAST-NEXT:    i64.shr_u $push3=, $pop2, $pop1
+; SIMD128-FAST-NEXT:    i64x2.replace_lane $push0=, $pop7, 1, $pop3
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shr_u_vec_v2i64:
+; NO-SIMD128:         .functype shr_u_vec_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.shr_u $push0=, $2, $4
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.shr_u $push1=, $1, $3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: shr_u_vec_v2i64:
+; NO-SIMD128-FAST:         .functype shr_u_vec_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.shr_u $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.shr_u $push1=, $2, $4
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = lshr <2 x i64> %v, %x
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: and_v2i64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype and_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @and_v2i64(<2 x i64> %x, <2 x i64> %y) {
+; SIMD128-LABEL: and_v2i64:
+; SIMD128:         .functype and_v2i64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.and $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: and_v2i64:
+; SIMD128-FAST:         .functype and_v2i64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.and $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: and_v2i64:
+; NO-SIMD128:         .functype and_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.and $push0=, $2, $4
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.and $push1=, $1, $3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: and_v2i64:
+; NO-SIMD128-FAST:         .functype and_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.and $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.and $push1=, $2, $4
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = and <2 x i64> %x, %y
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: or_v2i64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype or_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @or_v2i64(<2 x i64> %x, <2 x i64> %y) {
+; SIMD128-LABEL: or_v2i64:
+; SIMD128:         .functype or_v2i64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.or $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: or_v2i64:
+; SIMD128-FAST:         .functype or_v2i64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.or $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: or_v2i64:
+; NO-SIMD128:         .functype or_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.or $push0=, $2, $4
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.or $push1=, $1, $3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: or_v2i64:
+; NO-SIMD128-FAST:         .functype or_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.or $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.or $push1=, $2, $4
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = or <2 x i64> %x, %y
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: xor_v2i64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype xor_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.xor $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @xor_v2i64(<2 x i64> %x, <2 x i64> %y) {
+; SIMD128-LABEL: xor_v2i64:
+; SIMD128:         .functype xor_v2i64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.xor $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: xor_v2i64:
+; SIMD128-FAST:         .functype xor_v2i64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.xor $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: xor_v2i64:
+; NO-SIMD128:         .functype xor_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.xor $push0=, $2, $4
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.xor $push1=, $1, $3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: xor_v2i64:
+; NO-SIMD128-FAST:         .functype xor_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.xor $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.xor $push1=, $2, $4
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = xor <2 x i64> %x, %y
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: not_v2i64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype not_v2i64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @not_v2i64(<2 x i64> %x) {
+; SIMD128-LABEL: not_v2i64:
+; SIMD128:         .functype not_v2i64 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.not $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: not_v2i64:
+; SIMD128-FAST:         .functype not_v2i64 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.not $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: not_v2i64:
+; NO-SIMD128:         .functype not_v2i64 (i32, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, -1
+; NO-SIMD128-NEXT:    i64.xor $push1=, $2, $pop0
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop1
+; NO-SIMD128-NEXT:    i64.const $push3=, -1
+; NO-SIMD128-NEXT:    i64.xor $push2=, $1, $pop3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop2
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: not_v2i64:
+; NO-SIMD128-FAST:         .functype not_v2i64 (i32, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.const $push0=, -1
+; NO-SIMD128-FAST-NEXT:    i64.xor $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    i64.const $push3=, -1
+; NO-SIMD128-FAST-NEXT:    i64.xor $push2=, $2, $pop3
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    return
   %a = xor <2 x i64> %x, <i64 -1, i64 -1>
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: andnot_v2i64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype andnot_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: v128.andnot $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
-; SIMD128-FAST-NEXT: v128.not
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: return
 define <2 x i64> @andnot_v2i64(<2 x i64> %x, <2 x i64> %y) {
+; SIMD128-LABEL: andnot_v2i64:
+; SIMD128:         .functype andnot_v2i64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.andnot $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: andnot_v2i64:
+; SIMD128-FAST:         .functype andnot_v2i64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.not $push0=, $1
+; SIMD128-FAST-NEXT:    v128.and $push1=, $0, $pop0
+; SIMD128-FAST-NEXT:    return $pop1
+;
+; NO-SIMD128-LABEL: andnot_v2i64:
+; NO-SIMD128:         .functype andnot_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, -1
+; NO-SIMD128-NEXT:    i64.xor $push1=, $4, $pop0
+; NO-SIMD128-NEXT:    i64.and $push2=, $2, $pop1
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop2
+; NO-SIMD128-NEXT:    i64.const $push5=, -1
+; NO-SIMD128-NEXT:    i64.xor $push3=, $3, $pop5
+; NO-SIMD128-NEXT:    i64.and $push4=, $1, $pop3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop4
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: andnot_v2i64:
+; NO-SIMD128-FAST:         .functype andnot_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.const $push0=, -1
+; NO-SIMD128-FAST-NEXT:    i64.xor $push1=, $3, $pop0
+; NO-SIMD128-FAST-NEXT:    i64.and $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i64.const $push5=, -1
+; NO-SIMD128-FAST-NEXT:    i64.xor $push3=, $4, $pop5
+; NO-SIMD128-FAST-NEXT:    i64.and $push4=, $2, $pop3
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop4
+; NO-SIMD128-FAST-NEXT:    return
  %inv_y = xor <2 x i64> %y, <i64 -1, i64 -1>
  %a = and <2 x i64> %x, %inv_y
  ret <2 x i64> %a
 }
 
-; CHECK-LABEL: bitselect_v2i64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype bitselect_v2i64 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
-; SIMD128-FAST-NEXT: v128.not
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: v128.or
-; SIMD128-FAST-NEXT: return
 define <2 x i64> @bitselect_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x i64> %v2) {
+; SIMD128-LABEL: bitselect_v2i64:
+; SIMD128:         .functype bitselect_v2i64 (v128, v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.bitselect $push0=, $1, $2, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: bitselect_v2i64:
+; SIMD128-FAST:         .functype bitselect_v2i64 (v128, v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.not $push2=, $0
+; SIMD128-FAST-NEXT:    v128.and $push3=, $2, $pop2
+; SIMD128-FAST-NEXT:    v128.and $push0=, $1, $0
+; SIMD128-FAST-NEXT:    v128.or $push1=, $pop3, $pop0
+; SIMD128-FAST-NEXT:    return $pop1
+;
+; NO-SIMD128-LABEL: bitselect_v2i64:
+; NO-SIMD128:         .functype bitselect_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push1=, -1
+; NO-SIMD128-NEXT:    i64.xor $push2=, $2, $pop1
+; NO-SIMD128-NEXT:    i64.and $push3=, $6, $pop2
+; NO-SIMD128-NEXT:    i64.and $push0=, $4, $2
+; NO-SIMD128-NEXT:    i64.or $push4=, $pop3, $pop0
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop4
+; NO-SIMD128-NEXT:    i64.const $push9=, -1
+; NO-SIMD128-NEXT:    i64.xor $push6=, $1, $pop9
+; NO-SIMD128-NEXT:    i64.and $push7=, $5, $pop6
+; NO-SIMD128-NEXT:    i64.and $push5=, $3, $1
+; NO-SIMD128-NEXT:    i64.or $push8=, $pop7, $pop5
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop8
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: bitselect_v2i64:
+; NO-SIMD128-FAST:         .functype bitselect_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.const $push1=, -1
+; NO-SIMD128-FAST-NEXT:    i64.xor $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i64.and $push3=, $5, $pop2
+; NO-SIMD128-FAST-NEXT:    i64.and $push0=, $3, $1
+; NO-SIMD128-FAST-NEXT:    i64.or $push4=, $pop3, $pop0
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i64.const $push9=, -1
+; NO-SIMD128-FAST-NEXT:    i64.xor $push6=, $2, $pop9
+; NO-SIMD128-FAST-NEXT:    i64.and $push7=, $6, $pop6
+; NO-SIMD128-FAST-NEXT:    i64.and $push5=, $4, $2
+; NO-SIMD128-FAST-NEXT:    i64.or $push8=, $pop7, $pop5
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop8
+; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <2 x i64> %v1, %c
   %inv_mask = xor <2 x i64> <i64 -1, i64 -1>, %c
   %masked_v2 = and <2 x i64> %v2, %inv_mask
@@ -1488,31 +12747,101 @@ define <2 x i64> @bitselect_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x i64> %v2) {
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: bitselect_xor_v2i64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype bitselect_xor_v2i64 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $0{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
-; SIMD128-FAST-NEXT: v128.xor
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: v128.xor
 define <2 x i64> @bitselect_xor_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x i64> %v2) {
+; SIMD128-LABEL: bitselect_xor_v2i64:
+; SIMD128:         .functype bitselect_xor_v2i64 (v128, v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.bitselect $push0=, $1, $2, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: bitselect_xor_v2i64:
+; SIMD128-FAST:         .functype bitselect_xor_v2i64 (v128, v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.xor $push2=, $1, $2
+; SIMD128-FAST-NEXT:    v128.and $push1=, $pop2, $0
+; SIMD128-FAST-NEXT:    v128.xor $push0=, $pop1, $2
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: bitselect_xor_v2i64:
+; NO-SIMD128:         .functype bitselect_xor_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.xor $push0=, $4, $6
+; NO-SIMD128-NEXT:    i64.and $push1=, $pop0, $2
+; NO-SIMD128-NEXT:    i64.xor $push2=, $pop1, $6
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop2
+; NO-SIMD128-NEXT:    i64.xor $push3=, $3, $5
+; NO-SIMD128-NEXT:    i64.and $push4=, $pop3, $1
+; NO-SIMD128-NEXT:    i64.xor $push5=, $pop4, $5
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop5
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: bitselect_xor_v2i64:
+; NO-SIMD128-FAST:         .functype bitselect_xor_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.xor $push0=, $3, $5
+; NO-SIMD128-FAST-NEXT:    i64.and $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT:    i64.xor $push2=, $pop1, $5
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i64.xor $push3=, $4, $6
+; NO-SIMD128-FAST-NEXT:    i64.and $push4=, $pop3, $2
+; NO-SIMD128-FAST-NEXT:    i64.xor $push5=, $pop4, $6
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    return
  %xor1 = xor <2 x i64> %v1, %v2
  %and = and <2 x i64> %xor1, %c
  %a = xor <2 x i64> %and, %v2
  ret <2 x i64> %a
 }
 
-; CHECK-LABEL: bitselect_xor_reversed_v2i64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype bitselect_xor_reversed_v2i64 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $2, $1, $0{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
-; SIMD128-FAST-NEXT: v128.xor
-; SIMD128-FAST-NEXT: v128.not
-; SIMD128-FAST-NEXT: v128.and
-; SIMD128-FAST-NEXT: v128.xor
 define <2 x i64> @bitselect_xor_reversed_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x i64> %v2) {
+; SIMD128-LABEL: bitselect_xor_reversed_v2i64:
+; SIMD128:         .functype bitselect_xor_reversed_v2i64 (v128, v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.bitselect $push0=, $2, $1, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: bitselect_xor_reversed_v2i64:
+; SIMD128-FAST:         .functype bitselect_xor_reversed_v2i64 (v128, v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.xor $push1=, $1, $2
+; SIMD128-FAST-NEXT:    v128.not $push2=, $0
+; SIMD128-FAST-NEXT:    v128.and $push3=, $pop1, $pop2
+; SIMD128-FAST-NEXT:    v128.xor $push0=, $pop3, $2
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: bitselect_xor_reversed_v2i64:
+; NO-SIMD128:         .functype bitselect_xor_reversed_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.xor $push2=, $4, $6
+; NO-SIMD128-NEXT:    i64.const $push0=, -1
+; NO-SIMD128-NEXT:    i64.xor $push1=, $2, $pop0
+; NO-SIMD128-NEXT:    i64.and $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT:    i64.xor $push4=, $pop3, $6
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop4
+; NO-SIMD128-NEXT:    i64.xor $push6=, $3, $5
+; NO-SIMD128-NEXT:    i64.const $push9=, -1
+; NO-SIMD128-NEXT:    i64.xor $push5=, $1, $pop9
+; NO-SIMD128-NEXT:    i64.and $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT:    i64.xor $push8=, $pop7, $5
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop8
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: bitselect_xor_reversed_v2i64:
+; NO-SIMD128-FAST:         .functype bitselect_xor_reversed_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.xor $push2=, $3, $5
+; NO-SIMD128-FAST-NEXT:    i64.const $push0=, -1
+; NO-SIMD128-FAST-NEXT:    i64.xor $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    i64.and $push3=, $pop2, $pop1
+; NO-SIMD128-FAST-NEXT:    i64.xor $push4=, $pop3, $5
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i64.xor $push6=, $4, $6
+; NO-SIMD128-FAST-NEXT:    i64.const $push9=, -1
+; NO-SIMD128-FAST-NEXT:    i64.xor $push5=, $2, $pop9
+; NO-SIMD128-FAST-NEXT:    i64.and $push7=, $pop6, $pop5
+; NO-SIMD128-FAST-NEXT:    i64.xor $push8=, $pop7, $6
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop8
+; NO-SIMD128-FAST-NEXT:    return
  %xor1 = xor <2 x i64> %v1, %v2
  %notc = xor <2 x i64> %c, <i64 -1, i64 -1>
  %and = and <2 x i64> %xor1, %notc
@@ -1520,12 +12849,46 @@ define <2 x i64> @bitselect_xor_reversed_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x
  ret <2 x i64> %a
 }
 
-; CHECK-LABEL: extmul_low_s_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype extmul_low_s_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: i64x2.extmul_low_i32x4_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @extmul_low_s_v2i64(<4 x i32> %v1, <4 x i32> %v2) {
+; SIMD128-LABEL: extmul_low_s_v2i64:
+; SIMD128:         .functype extmul_low_s_v2i64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.extmul_low_i32x4_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: extmul_low_s_v2i64:
+; SIMD128-FAST:         .functype extmul_low_s_v2i64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.extend_low_i32x4_s $push0=, $0
+; SIMD128-FAST-NEXT:    i64x2.extend_low_i32x4_s $push1=, $1
+; SIMD128-FAST-NEXT:    i64x2.mul $push2=, $pop0, $pop1
+; SIMD128-FAST-NEXT:    return $pop2
+;
+; NO-SIMD128-LABEL: extmul_low_s_v2i64:
+; NO-SIMD128:         .functype extmul_low_s_v2i64 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.extend_i32_s $push1=, $2
+; NO-SIMD128-NEXT:    i64.extend_i32_s $push0=, $6
+; NO-SIMD128-NEXT:    i64.mul $push2=, $pop1, $pop0
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop2
+; NO-SIMD128-NEXT:    i64.extend_i32_s $push4=, $1
+; NO-SIMD128-NEXT:    i64.extend_i32_s $push3=, $5
+; NO-SIMD128-NEXT:    i64.mul $push5=, $pop4, $pop3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop5
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: extmul_low_s_v2i64:
+; NO-SIMD128-FAST:         .functype extmul_low_s_v2i64 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_s $push1=, $1
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_s $push0=, $5
+; NO-SIMD128-FAST-NEXT:    i64.mul $push2=, $pop1, $pop0
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_s $push4=, $2
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_s $push3=, $6
+; NO-SIMD128-FAST-NEXT:    i64.mul $push5=, $pop4, $pop3
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %low1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
   %low2 = shufflevector <4 x i32> %v2, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
   %extended1 = sext <2 x i32> %low1 to <2 x i64>
@@ -1534,12 +12897,46 @@ define <2 x i64> @extmul_low_s_v2i64(<4 x i32> %v1, <4 x i32> %v2) {
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: extmul_high_s_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype extmul_high_s_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: i64x2.extmul_high_i32x4_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @extmul_high_s_v2i64(<4 x i32> %v1, <4 x i32> %v2) {
+; SIMD128-LABEL: extmul_high_s_v2i64:
+; SIMD128:         .functype extmul_high_s_v2i64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.extmul_high_i32x4_s $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: extmul_high_s_v2i64:
+; SIMD128-FAST:         .functype extmul_high_s_v2i64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.extend_high_i32x4_s $push0=, $0
+; SIMD128-FAST-NEXT:    i64x2.extend_high_i32x4_s $push1=, $1
+; SIMD128-FAST-NEXT:    i64x2.mul $push2=, $pop0, $pop1
+; SIMD128-FAST-NEXT:    return $pop2
+;
+; NO-SIMD128-LABEL: extmul_high_s_v2i64:
+; NO-SIMD128:         .functype extmul_high_s_v2i64 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.extend_i32_s $push1=, $4
+; NO-SIMD128-NEXT:    i64.extend_i32_s $push0=, $8
+; NO-SIMD128-NEXT:    i64.mul $push2=, $pop1, $pop0
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop2
+; NO-SIMD128-NEXT:    i64.extend_i32_s $push4=, $3
+; NO-SIMD128-NEXT:    i64.extend_i32_s $push3=, $7
+; NO-SIMD128-NEXT:    i64.mul $push5=, $pop4, $pop3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop5
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: extmul_high_s_v2i64:
+; NO-SIMD128-FAST:         .functype extmul_high_s_v2i64 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_s $push1=, $3
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_s $push0=, $7
+; NO-SIMD128-FAST-NEXT:    i64.mul $push2=, $pop1, $pop0
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_s $push4=, $4
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_s $push3=, $8
+; NO-SIMD128-FAST-NEXT:    i64.mul $push5=, $pop4, $pop3
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %high1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %high2 = shufflevector <4 x i32> %v2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %extended1 = sext <2 x i32> %high1 to <2 x i64>
@@ -1548,12 +12945,46 @@ define <2 x i64> @extmul_high_s_v2i64(<4 x i32> %v1, <4 x i32> %v2) {
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: extmul_low_u_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype extmul_low_u_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: i64x2.extmul_low_i32x4_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @extmul_low_u_v2i64(<4 x i32> %v1, <4 x i32> %v2) {
+; SIMD128-LABEL: extmul_low_u_v2i64:
+; SIMD128:         .functype extmul_low_u_v2i64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.extmul_low_i32x4_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: extmul_low_u_v2i64:
+; SIMD128-FAST:         .functype extmul_low_u_v2i64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.extend_low_i32x4_u $push0=, $0
+; SIMD128-FAST-NEXT:    i64x2.extend_low_i32x4_u $push1=, $1
+; SIMD128-FAST-NEXT:    i64x2.mul $push2=, $pop0, $pop1
+; SIMD128-FAST-NEXT:    return $pop2
+;
+; NO-SIMD128-LABEL: extmul_low_u_v2i64:
+; NO-SIMD128:         .functype extmul_low_u_v2i64 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.extend_i32_u $push1=, $2
+; NO-SIMD128-NEXT:    i64.extend_i32_u $push0=, $6
+; NO-SIMD128-NEXT:    i64.mul $push2=, $pop1, $pop0
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop2
+; NO-SIMD128-NEXT:    i64.extend_i32_u $push4=, $1
+; NO-SIMD128-NEXT:    i64.extend_i32_u $push3=, $5
+; NO-SIMD128-NEXT:    i64.mul $push5=, $pop4, $pop3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop5
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: extmul_low_u_v2i64:
+; NO-SIMD128-FAST:         .functype extmul_low_u_v2i64 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_u $push1=, $1
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_u $push0=, $5
+; NO-SIMD128-FAST-NEXT:    i64.mul $push2=, $pop1, $pop0
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_u $push4=, $2
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_u $push3=, $6
+; NO-SIMD128-FAST-NEXT:    i64.mul $push5=, $pop4, $pop3
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %low1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
   %low2 = shufflevector <4 x i32> %v2, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
   %extended1 = zext <2 x i32> %low1 to <2 x i64>
@@ -1562,12 +12993,46 @@ define <2 x i64> @extmul_low_u_v2i64(<4 x i32> %v1, <4 x i32> %v2) {
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: extmul_high_u_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype extmul_high_u_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-SLOW-NEXT: i64x2.extmul_high_i32x4_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-SLOW-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @extmul_high_u_v2i64(<4 x i32> %v1, <4 x i32> %v2) {
+; SIMD128-LABEL: extmul_high_u_v2i64:
+; SIMD128:         .functype extmul_high_u_v2i64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.extmul_high_i32x4_u $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: extmul_high_u_v2i64:
+; SIMD128-FAST:         .functype extmul_high_u_v2i64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    i64x2.extend_high_i32x4_u $push0=, $0
+; SIMD128-FAST-NEXT:    i64x2.extend_high_i32x4_u $push1=, $1
+; SIMD128-FAST-NEXT:    i64x2.mul $push2=, $pop0, $pop1
+; SIMD128-FAST-NEXT:    return $pop2
+;
+; NO-SIMD128-LABEL: extmul_high_u_v2i64:
+; NO-SIMD128:         .functype extmul_high_u_v2i64 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.extend_i32_u $push1=, $4
+; NO-SIMD128-NEXT:    i64.extend_i32_u $push0=, $8
+; NO-SIMD128-NEXT:    i64.mul $push2=, $pop1, $pop0
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop2
+; NO-SIMD128-NEXT:    i64.extend_i32_u $push4=, $3
+; NO-SIMD128-NEXT:    i64.extend_i32_u $push3=, $7
+; NO-SIMD128-NEXT:    i64.mul $push5=, $pop4, $pop3
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop5
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: extmul_high_u_v2i64:
+; NO-SIMD128-FAST:         .functype extmul_high_u_v2i64 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_u $push1=, $3
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_u $push0=, $7
+; NO-SIMD128-FAST-NEXT:    i64.mul $push2=, $pop1, $pop0
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_u $push4=, $4
+; NO-SIMD128-FAST-NEXT:    i64.extend_i32_u $push3=, $8
+; NO-SIMD128-FAST-NEXT:    i64.mul $push5=, $pop4, $pop3
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %high1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %high2 = shufflevector <4 x i32> %v2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %extended1 = zext <2 x i32> %high1 to <2 x i64>
@@ -1579,130 +13044,550 @@ define <2 x i64> @extmul_high_u_v2i64(<4 x i32> %v1, <4 x i32> %v2) {
 ; ==============================================================================
 ; 4 x float
 ; ==============================================================================
-; CHECK-LABEL: neg_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype neg_v4f32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.neg $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @neg_v4f32(<4 x float> %x) {
   ; nsz makes this semantically equivalent to flipping sign bit
+; SIMD128-LABEL: neg_v4f32:
+; SIMD128:         .functype neg_v4f32 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.neg $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: neg_v4f32:
+; SIMD128-FAST:         .functype neg_v4f32 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.neg $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: neg_v4f32:
+; NO-SIMD128:         .functype neg_v4f32 (i32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.neg $push0=, $3
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f32.neg $push1=, $2
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    f32.neg $push2=, $1
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-NEXT:    f32.neg $push5=, $4
+; NO-SIMD128-NEXT:    f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: neg_v4f32:
+; NO-SIMD128-FAST:         .functype neg_v4f32 (i32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.neg $push0=, $1
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f32.neg $push1=, $2
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.neg $push2=, $3
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    f32.neg $push5=, $4
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = fsub nsz <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, %x
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: abs_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype abs_v4f32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.abs $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.fabs.v4f32(<4 x float>) nounwind readnone
 define <4 x float> @abs_v4f32(<4 x float> %x) {
+; SIMD128-LABEL: abs_v4f32:
+; SIMD128:         .functype abs_v4f32 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.abs $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: abs_v4f32:
+; SIMD128-FAST:         .functype abs_v4f32 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.abs $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: abs_v4f32:
+; NO-SIMD128:         .functype abs_v4f32 (i32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.abs $push0=, $3
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f32.abs $push1=, $2
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    f32.abs $push2=, $1
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-NEXT:    f32.abs $push5=, $4
+; NO-SIMD128-NEXT:    f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: abs_v4f32:
+; NO-SIMD128-FAST:         .functype abs_v4f32 (i32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.abs $push0=, $1
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f32.abs $push1=, $2
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.abs $push2=, $3
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    f32.abs $push5=, $4
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: min_unordered_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype min_unordered_v4f32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2{{$}}
-; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @min_unordered_v4f32(<4 x float> %x) {
+; SIMD128-LABEL: min_unordered_v4f32:
+; SIMD128:         .functype min_unordered_v4f32 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2
+; SIMD128-NEXT:    f32x4.min $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: min_unordered_v4f32:
+; SIMD128-FAST:         .functype min_unordered_v4f32 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.const $push1=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2
+; SIMD128-FAST-NEXT:    f32x4.min $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: min_unordered_v4f32:
+; NO-SIMD128:         .functype min_unordered_v4f32 (i32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.const $push0=, 0x1.4p2
+; NO-SIMD128-NEXT:    f32.min $push1=, $3, $pop0
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop1
+; NO-SIMD128-NEXT:    f32.const $push9=, 0x1.4p2
+; NO-SIMD128-NEXT:    f32.min $push2=, $2, $pop9
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop2
+; NO-SIMD128-NEXT:    f32.const $push8=, 0x1.4p2
+; NO-SIMD128-NEXT:    f32.min $push3=, $1, $pop8
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 12
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    f32.const $push7=, 0x1.4p2
+; NO-SIMD128-NEXT:    f32.min $push4=, $4, $pop7
+; NO-SIMD128-NEXT:    f32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: min_unordered_v4f32:
+; NO-SIMD128-FAST:         .functype min_unordered_v4f32 (i32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.const $push0=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f32.min $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.const $push9=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f32.min $push2=, $2, $pop9
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop2
+; NO-SIMD128-FAST-NEXT:    f32.const $push8=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f32.min $push3=, $3, $pop8
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-FAST-NEXT:    f32.const $push7=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f32.min $push6=, $4, $pop7
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT:    return
   %cmps = fcmp ule <4 x float> %x, <float 5., float 5., float 5., float 5.>
   %a = select <4 x i1> %cmps, <4 x float> %x,
     <4 x float> <float 5., float 5., float 5., float 5.>
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: max_unordered_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype max_unordered_v4f32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2
-; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @max_unordered_v4f32(<4 x float> %x) {
+; SIMD128-LABEL: max_unordered_v4f32:
+; SIMD128:         .functype max_unordered_v4f32 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2
+; SIMD128-NEXT:    f32x4.max $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: max_unordered_v4f32:
+; SIMD128-FAST:         .functype max_unordered_v4f32 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.const $push1=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2
+; SIMD128-FAST-NEXT:    f32x4.max $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: max_unordered_v4f32:
+; NO-SIMD128:         .functype max_unordered_v4f32 (i32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.const $push0=, 0x1.4p2
+; NO-SIMD128-NEXT:    f32.max $push1=, $3, $pop0
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop1
+; NO-SIMD128-NEXT:    f32.const $push9=, 0x1.4p2
+; NO-SIMD128-NEXT:    f32.max $push2=, $2, $pop9
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop2
+; NO-SIMD128-NEXT:    f32.const $push8=, 0x1.4p2
+; NO-SIMD128-NEXT:    f32.max $push3=, $1, $pop8
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 12
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    f32.const $push7=, 0x1.4p2
+; NO-SIMD128-NEXT:    f32.max $push4=, $4, $pop7
+; NO-SIMD128-NEXT:    f32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: max_unordered_v4f32:
+; NO-SIMD128-FAST:         .functype max_unordered_v4f32 (i32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.const $push0=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f32.max $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.const $push9=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f32.max $push2=, $2, $pop9
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop2
+; NO-SIMD128-FAST-NEXT:    f32.const $push8=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f32.max $push3=, $3, $pop8
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-FAST-NEXT:    f32.const $push7=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f32.max $push6=, $4, $pop7
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT:    return
   %cmps = fcmp uge <4 x float> %x, <float 5., float 5., float 5., float 5.>
   %a = select <4 x i1> %cmps, <4 x float> %x,
     <4 x float> <float 5., float 5., float 5., float 5.>
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: min_ordered_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype min_ordered_v4f32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2{{$}}
-; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @min_ordered_v4f32(<4 x float> %x) {
+; SIMD128-LABEL: min_ordered_v4f32:
+; SIMD128:         .functype min_ordered_v4f32 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2
+; SIMD128-NEXT:    f32x4.min $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: min_ordered_v4f32:
+; SIMD128-FAST:         .functype min_ordered_v4f32 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.const $push1=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2
+; SIMD128-FAST-NEXT:    f32x4.min $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: min_ordered_v4f32:
+; NO-SIMD128:         .functype min_ordered_v4f32 (i32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.const $push0=, 0x1.4p2
+; NO-SIMD128-NEXT:    f32.min $push1=, $3, $pop0
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop1
+; NO-SIMD128-NEXT:    f32.const $push9=, 0x1.4p2
+; NO-SIMD128-NEXT:    f32.min $push2=, $2, $pop9
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop2
+; NO-SIMD128-NEXT:    f32.const $push8=, 0x1.4p2
+; NO-SIMD128-NEXT:    f32.min $push3=, $1, $pop8
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 12
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    f32.const $push7=, 0x1.4p2
+; NO-SIMD128-NEXT:    f32.min $push4=, $4, $pop7
+; NO-SIMD128-NEXT:    f32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: min_ordered_v4f32:
+; NO-SIMD128-FAST:         .functype min_ordered_v4f32 (i32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.const $push0=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f32.min $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.const $push9=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f32.min $push2=, $2, $pop9
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop2
+; NO-SIMD128-FAST-NEXT:    f32.const $push8=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f32.min $push3=, $3, $pop8
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-FAST-NEXT:    f32.const $push7=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f32.min $push6=, $4, $pop7
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT:    return
   %cmps = fcmp ole <4 x float> <float 5., float 5., float 5., float 5.>, %x
   %a = select <4 x i1> %cmps,
     <4 x float> <float 5., float 5., float 5., float 5.>, <4 x float> %x
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: max_ordered_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype max_ordered_v4f32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2{{$}}
-; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @max_ordered_v4f32(<4 x float> %x) {
+; SIMD128-LABEL: max_ordered_v4f32:
+; SIMD128:         .functype max_ordered_v4f32 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2
+; SIMD128-NEXT:    f32x4.max $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: max_ordered_v4f32:
+; SIMD128-FAST:         .functype max_ordered_v4f32 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.const $push1=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2
+; SIMD128-FAST-NEXT:    f32x4.max $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: max_ordered_v4f32:
+; NO-SIMD128:         .functype max_ordered_v4f32 (i32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.const $push0=, 0x1.4p2
+; NO-SIMD128-NEXT:    f32.max $push1=, $3, $pop0
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop1
+; NO-SIMD128-NEXT:    f32.const $push9=, 0x1.4p2
+; NO-SIMD128-NEXT:    f32.max $push2=, $2, $pop9
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop2
+; NO-SIMD128-NEXT:    f32.const $push8=, 0x1.4p2
+; NO-SIMD128-NEXT:    f32.max $push3=, $1, $pop8
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop3
+; NO-SIMD128-NEXT:    i32.const $push5=, 12
+; NO-SIMD128-NEXT:    i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT:    f32.const $push7=, 0x1.4p2
+; NO-SIMD128-NEXT:    f32.max $push4=, $4, $pop7
+; NO-SIMD128-NEXT:    f32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: max_ordered_v4f32:
+; NO-SIMD128-FAST:         .functype max_ordered_v4f32 (i32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.const $push0=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f32.max $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.const $push9=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f32.max $push2=, $2, $pop9
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop2
+; NO-SIMD128-FAST-NEXT:    f32.const $push8=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f32.max $push3=, $3, $pop8
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop3
+; NO-SIMD128-FAST-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-FAST-NEXT:    f32.const $push7=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f32.max $push6=, $4, $pop7
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT:    return
   %cmps = fcmp oge <4 x float> <float 5., float 5., float 5., float 5.>, %x
   %a = select <4 x i1> %cmps,
     <4 x float> <float 5., float 5., float 5., float 5.>, <4 x float> %x
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: min_intrinsic_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype min_intrinsic_v4f32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
 define <4 x float> @min_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: min_intrinsic_v4f32:
+; SIMD128:         .functype min_intrinsic_v4f32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.min $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: min_intrinsic_v4f32:
+; SIMD128-FAST:         .functype min_intrinsic_v4f32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.min $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: min_intrinsic_v4f32:
+; NO-SIMD128:         .functype min_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.min $push0=, $3, $7
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f32.min $push1=, $2, $6
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    f32.min $push2=, $1, $5
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    f32.min $push3=, $4, $8
+; NO-SIMD128-NEXT:    f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: min_intrinsic_v4f32:
+; NO-SIMD128-FAST:         .functype min_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.min $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f32.min $push1=, $2, $6
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.min $push2=, $3, $7
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    f32.min $push5=, $4, $8
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> %y)
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: minnum_intrinsic_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype minnum_intrinsic_v4f32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
 define <4 x float> @minnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: minnum_intrinsic_v4f32:
+; SIMD128:         .functype minnum_intrinsic_v4f32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.min $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: minnum_intrinsic_v4f32:
+; SIMD128-FAST:         .functype minnum_intrinsic_v4f32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.min $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: minnum_intrinsic_v4f32:
+; NO-SIMD128:         .functype minnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    call $push0=, fminf, $3, $7
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    call $push1=, fminf, $2, $6
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    call $push2=, fminf, $1, $5
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-NEXT:    call $push5=, fminf, $4, $8
+; NO-SIMD128-NEXT:    f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: minnum_intrinsic_v4f32:
+; NO-SIMD128-FAST:         .functype minnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    call $push0=, fminf, $1, $5
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    call $push1=, fminf, $2, $6
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    call $push2=, fminf, $3, $7
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    call $push5=, fminf, $4, $8
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y)
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: max_intrinsic_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype max_intrinsic_v4f32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
 define <4 x float> @max_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: max_intrinsic_v4f32:
+; SIMD128:         .functype max_intrinsic_v4f32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.max $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: max_intrinsic_v4f32:
+; SIMD128-FAST:         .functype max_intrinsic_v4f32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.max $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: max_intrinsic_v4f32:
+; NO-SIMD128:         .functype max_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.max $push0=, $3, $7
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f32.max $push1=, $2, $6
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    f32.max $push2=, $1, $5
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    f32.max $push3=, $4, $8
+; NO-SIMD128-NEXT:    f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: max_intrinsic_v4f32:
+; NO-SIMD128-FAST:         .functype max_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.max $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f32.max $push1=, $2, $6
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.max $push2=, $3, $7
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    f32.max $push5=, $4, $8
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y)
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: maxnum_intrinsic_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype maxnum_intrinsic_v4f32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
 define <4 x float> @maxnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: maxnum_intrinsic_v4f32:
+; SIMD128:         .functype maxnum_intrinsic_v4f32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.max $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: maxnum_intrinsic_v4f32:
+; SIMD128-FAST:         .functype maxnum_intrinsic_v4f32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.max $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: maxnum_intrinsic_v4f32:
+; NO-SIMD128:         .functype maxnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    call $push0=, fmaxf, $3, $7
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    call $push1=, fmaxf, $2, $6
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    call $push2=, fmaxf, $1, $5
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-NEXT:    call $push5=, fmaxf, $4, $8
+; NO-SIMD128-NEXT:    f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: maxnum_intrinsic_v4f32:
+; NO-SIMD128-FAST:         .functype maxnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    call $push0=, fmaxf, $1, $5
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    call $push1=, fmaxf, $2, $6
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    call $push2=, fmaxf, $3, $7
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    call $push5=, fmaxf, $4, $8
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y)
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: min_const_intrinsic_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype min_const_intrinsic_v4f32 () -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @min_const_intrinsic_v4f32() {
+; SIMD128-LABEL: min_const_intrinsic_v4f32:
+; SIMD128:         .functype min_const_intrinsic_v4f32 () -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: min_const_intrinsic_v4f32:
+; SIMD128-FAST:         .functype min_const_intrinsic_v4f32 () -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.const $push0=, 0x1.4p2, 0x1.4p2, 0x1.4p2, 0x1.4p2
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: min_const_intrinsic_v4f32:
+; NO-SIMD128:         .functype min_const_intrinsic_v4f32 (i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 4656722015785320448
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.const $push1=, 4656722015785320448
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: min_const_intrinsic_v4f32:
+; NO-SIMD128-FAST:         .functype min_const_intrinsic_v4f32 (i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.const $push0=, 4656722015785320448
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.const $push1=, 4656722015785320448
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = call <4 x float> @llvm.minimum.v4f32(
     <4 x float> <float 42., float 42., float 42., float 42.>,
     <4 x float> <float 5., float 5., float 5., float 5.>
@@ -1710,12 +13595,36 @@ define <4 x float> @min_const_intrinsic_v4f32() {
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: max_const_intrinsic_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype max_const_intrinsic_v4f32 () -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 0x1.5p5, 0x1.5p5, 0x1.5p5, 0x1.5p5{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @max_const_intrinsic_v4f32() {
+; SIMD128-LABEL: max_const_intrinsic_v4f32:
+; SIMD128:         .functype max_const_intrinsic_v4f32 () -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0x1.5p5, 0x1.5p5, 0x1.5p5, 0x1.5p5
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: max_const_intrinsic_v4f32:
+; SIMD128-FAST:         .functype max_const_intrinsic_v4f32 () -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.const $push0=, 0x1.5p5, 0x1.5p5, 0x1.5p5, 0x1.5p5
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: max_const_intrinsic_v4f32:
+; NO-SIMD128:         .functype max_const_intrinsic_v4f32 (i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 4767060206681587712
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.const $push1=, 4767060206681587712
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: max_const_intrinsic_v4f32:
+; NO-SIMD128-FAST:         .functype max_const_intrinsic_v4f32 (i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.const $push0=, 4767060206681587712
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.const $push1=, 4767060206681587712
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = call <4 x float> @llvm.maximum.v4f32(
     <4 x float> <float 42., float 42., float 42., float 42.>,
     <4 x float> <float 5., float 5., float 5., float 5.>
@@ -1723,23 +13632,127 @@ define <4 x float> @max_const_intrinsic_v4f32() {
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: pmin_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype pmin_v4f32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.pmin $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @pmin_v4f32(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: pmin_v4f32:
+; SIMD128:         .functype pmin_v4f32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.pmin $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: pmin_v4f32:
+; SIMD128-FAST:         .functype pmin_v4f32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.pmin $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: pmin_v4f32:
+; NO-SIMD128:         .functype pmin_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.lt $push0=, $7, $3
+; NO-SIMD128-NEXT:    f32.select $push1=, $7, $3, $pop0
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop1
+; NO-SIMD128-NEXT:    f32.lt $push2=, $6, $2
+; NO-SIMD128-NEXT:    f32.select $push3=, $6, $2, $pop2
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop3
+; NO-SIMD128-NEXT:    f32.lt $push4=, $5, $1
+; NO-SIMD128-NEXT:    f32.select $push5=, $5, $1, $pop4
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop5
+; NO-SIMD128-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    f32.lt $push6=, $8, $4
+; NO-SIMD128-NEXT:    f32.select $push7=, $8, $4, $pop6
+; NO-SIMD128-NEXT:    f32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: pmin_v4f32:
+; NO-SIMD128-FAST:         .functype pmin_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.lt $push0=, $5, $1
+; NO-SIMD128-FAST-NEXT:    f32.select $push1=, $5, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.lt $push2=, $6, $2
+; NO-SIMD128-FAST-NEXT:    f32.select $push3=, $6, $2, $pop2
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    f32.lt $push4=, $7, $3
+; NO-SIMD128-FAST-NEXT:    f32.select $push5=, $7, $3, $pop4
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-FAST-NEXT:    f32.lt $push6=, $8, $4
+; NO-SIMD128-FAST-NEXT:    f32.select $push7=, $8, $4, $pop6
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT:    return
   %c = fcmp olt <4 x float> %y, %x
   %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: pmin_int_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype pmin_int_v4f32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.pmin $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @pmin_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
+; SIMD128-LABEL: pmin_int_v4f32:
+; SIMD128:         .functype pmin_int_v4f32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.pmin $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: pmin_int_v4f32:
+; SIMD128-FAST:         .functype pmin_int_v4f32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.pmin $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: pmin_int_v4f32:
+; NO-SIMD128:         .functype pmin_int_v4f32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    f32.reinterpret_i32 $push1=, $8
+; NO-SIMD128-NEXT:    f32.reinterpret_i32 $push0=, $4
+; NO-SIMD128-NEXT:    f32.lt $push2=, $pop1, $pop0
+; NO-SIMD128-NEXT:    i32.select $push3=, $8, $4, $pop2
+; NO-SIMD128-NEXT:    i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    f32.reinterpret_i32 $push7=, $7
+; NO-SIMD128-NEXT:    f32.reinterpret_i32 $push6=, $3
+; NO-SIMD128-NEXT:    f32.lt $push8=, $pop7, $pop6
+; NO-SIMD128-NEXT:    i32.select $push9=, $7, $3, $pop8
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop9
+; NO-SIMD128-NEXT:    f32.reinterpret_i32 $push11=, $6
+; NO-SIMD128-NEXT:    f32.reinterpret_i32 $push10=, $2
+; NO-SIMD128-NEXT:    f32.lt $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT:    i32.select $push13=, $6, $2, $pop12
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop13
+; NO-SIMD128-NEXT:    f32.reinterpret_i32 $push15=, $5
+; NO-SIMD128-NEXT:    f32.reinterpret_i32 $push14=, $1
+; NO-SIMD128-NEXT:    f32.lt $push16=, $pop15, $pop14
+; NO-SIMD128-NEXT:    i32.select $push17=, $5, $1, $pop16
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop17
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: pmin_int_v4f32:
+; NO-SIMD128-FAST:         .functype pmin_int_v4f32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.reinterpret_i32 $push1=, $5
+; NO-SIMD128-FAST-NEXT:    f32.reinterpret_i32 $push0=, $1
+; NO-SIMD128-FAST-NEXT:    f32.lt $push2=, $pop1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.select $push3=, $5, $1, $pop2
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    f32.reinterpret_i32 $push5=, $6
+; NO-SIMD128-FAST-NEXT:    f32.reinterpret_i32 $push4=, $2
+; NO-SIMD128-FAST-NEXT:    f32.lt $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.select $push7=, $6, $2, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop7
+; NO-SIMD128-FAST-NEXT:    f32.reinterpret_i32 $push9=, $7
+; NO-SIMD128-FAST-NEXT:    f32.reinterpret_i32 $push8=, $3
+; NO-SIMD128-FAST-NEXT:    f32.lt $push10=, $pop9, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.select $push11=, $7, $3, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop11
+; NO-SIMD128-FAST-NEXT:    i32.const $push16=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-FAST-NEXT:    f32.reinterpret_i32 $push13=, $8
+; NO-SIMD128-FAST-NEXT:    f32.reinterpret_i32 $push12=, $4
+; NO-SIMD128-FAST-NEXT:    f32.lt $push14=, $pop13, $pop12
+; NO-SIMD128-FAST-NEXT:    i32.select $push15=, $8, $4, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop17), $pop15
+; NO-SIMD128-FAST-NEXT:    return
   %fx = bitcast <4 x i32> %x to <4 x float>
   %fy = bitcast <4 x i32> %y to <4 x float>
   %c = fcmp olt <4 x float> %fy, %fx
@@ -1747,23 +13760,127 @@ define <4 x i32> @pmin_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: pmax_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype pmax_v4f32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.pmax $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @pmax_v4f32(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: pmax_v4f32:
+; SIMD128:         .functype pmax_v4f32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.pmax $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: pmax_v4f32:
+; SIMD128-FAST:         .functype pmax_v4f32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.pmax $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: pmax_v4f32:
+; NO-SIMD128:         .functype pmax_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.lt $push0=, $3, $7
+; NO-SIMD128-NEXT:    f32.select $push1=, $7, $3, $pop0
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop1
+; NO-SIMD128-NEXT:    f32.lt $push2=, $2, $6
+; NO-SIMD128-NEXT:    f32.select $push3=, $6, $2, $pop2
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop3
+; NO-SIMD128-NEXT:    f32.lt $push4=, $1, $5
+; NO-SIMD128-NEXT:    f32.select $push5=, $5, $1, $pop4
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop5
+; NO-SIMD128-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    f32.lt $push6=, $4, $8
+; NO-SIMD128-NEXT:    f32.select $push7=, $8, $4, $pop6
+; NO-SIMD128-NEXT:    f32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: pmax_v4f32:
+; NO-SIMD128-FAST:         .functype pmax_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.lt $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    f32.select $push1=, $5, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.lt $push2=, $2, $6
+; NO-SIMD128-FAST-NEXT:    f32.select $push3=, $6, $2, $pop2
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop3
+; NO-SIMD128-FAST-NEXT:    f32.lt $push4=, $3, $7
+; NO-SIMD128-FAST-NEXT:    f32.select $push5=, $7, $3, $pop4
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.const $push8=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-FAST-NEXT:    f32.lt $push6=, $4, $8
+; NO-SIMD128-FAST-NEXT:    f32.select $push7=, $8, $4, $pop6
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT:    return
   %c = fcmp olt <4 x float> %x, %y
   %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: pmax_int_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype pmax_int_v4f32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.pmax $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @pmax_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
+; SIMD128-LABEL: pmax_int_v4f32:
+; SIMD128:         .functype pmax_int_v4f32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.pmax $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: pmax_int_v4f32:
+; SIMD128-FAST:         .functype pmax_int_v4f32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.pmax $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: pmax_int_v4f32:
+; NO-SIMD128:         .functype pmax_int_v4f32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    f32.reinterpret_i32 $push1=, $4
+; NO-SIMD128-NEXT:    f32.reinterpret_i32 $push0=, $8
+; NO-SIMD128-NEXT:    f32.lt $push2=, $pop1, $pop0
+; NO-SIMD128-NEXT:    i32.select $push3=, $8, $4, $pop2
+; NO-SIMD128-NEXT:    i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    f32.reinterpret_i32 $push7=, $3
+; NO-SIMD128-NEXT:    f32.reinterpret_i32 $push6=, $7
+; NO-SIMD128-NEXT:    f32.lt $push8=, $pop7, $pop6
+; NO-SIMD128-NEXT:    i32.select $push9=, $7, $3, $pop8
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop9
+; NO-SIMD128-NEXT:    f32.reinterpret_i32 $push11=, $2
+; NO-SIMD128-NEXT:    f32.reinterpret_i32 $push10=, $6
+; NO-SIMD128-NEXT:    f32.lt $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT:    i32.select $push13=, $6, $2, $pop12
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop13
+; NO-SIMD128-NEXT:    f32.reinterpret_i32 $push15=, $1
+; NO-SIMD128-NEXT:    f32.reinterpret_i32 $push14=, $5
+; NO-SIMD128-NEXT:    f32.lt $push16=, $pop15, $pop14
+; NO-SIMD128-NEXT:    i32.select $push17=, $5, $1, $pop16
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop17
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: pmax_int_v4f32:
+; NO-SIMD128-FAST:         .functype pmax_int_v4f32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.reinterpret_i32 $push1=, $1
+; NO-SIMD128-FAST-NEXT:    f32.reinterpret_i32 $push0=, $5
+; NO-SIMD128-FAST-NEXT:    f32.lt $push2=, $pop1, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.select $push3=, $5, $1, $pop2
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    f32.reinterpret_i32 $push5=, $2
+; NO-SIMD128-FAST-NEXT:    f32.reinterpret_i32 $push4=, $6
+; NO-SIMD128-FAST-NEXT:    f32.lt $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT:    i32.select $push7=, $6, $2, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop7
+; NO-SIMD128-FAST-NEXT:    f32.reinterpret_i32 $push9=, $3
+; NO-SIMD128-FAST-NEXT:    f32.reinterpret_i32 $push8=, $7
+; NO-SIMD128-FAST-NEXT:    f32.lt $push10=, $pop9, $pop8
+; NO-SIMD128-FAST-NEXT:    i32.select $push11=, $7, $3, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop11
+; NO-SIMD128-FAST-NEXT:    i32.const $push16=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-FAST-NEXT:    f32.reinterpret_i32 $push13=, $4
+; NO-SIMD128-FAST-NEXT:    f32.reinterpret_i32 $push12=, $8
+; NO-SIMD128-FAST-NEXT:    f32.lt $push14=, $pop13, $pop12
+; NO-SIMD128-FAST-NEXT:    i32.select $push15=, $8, $4, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.store 0($pop17), $pop15
+; NO-SIMD128-FAST-NEXT:    return
   %fx = bitcast <4 x i32> %x to <4 x float>
   %fy = bitcast <4 x i32> %y to <4 x float>
   %c = fcmp olt <4 x float> %fx, %fy
@@ -1771,53 +13888,233 @@ define <4 x i32> @pmax_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i32> %a
 }
 
-; CHECK-LABEL: add_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype add_v4f32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.add $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @add_v4f32(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: add_v4f32:
+; SIMD128:         .functype add_v4f32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.add $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: add_v4f32:
+; SIMD128-FAST:         .functype add_v4f32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.add $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: add_v4f32:
+; NO-SIMD128:         .functype add_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.add $push0=, $3, $7
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f32.add $push1=, $2, $6
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    f32.add $push2=, $1, $5
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    f32.add $push3=, $4, $8
+; NO-SIMD128-NEXT:    f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: add_v4f32:
+; NO-SIMD128-FAST:         .functype add_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.add $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f32.add $push1=, $2, $6
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.add $push2=, $3, $7
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    f32.add $push5=, $4, $8
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = fadd <4 x float> %x, %y
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: sub_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype sub_v4f32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.sub $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @sub_v4f32(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: sub_v4f32:
+; SIMD128:         .functype sub_v4f32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.sub $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: sub_v4f32:
+; SIMD128-FAST:         .functype sub_v4f32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.sub $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: sub_v4f32:
+; NO-SIMD128:         .functype sub_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.sub $push0=, $3, $7
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f32.sub $push1=, $2, $6
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    f32.sub $push2=, $1, $5
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    f32.sub $push3=, $4, $8
+; NO-SIMD128-NEXT:    f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: sub_v4f32:
+; NO-SIMD128-FAST:         .functype sub_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.sub $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f32.sub $push1=, $2, $6
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.sub $push2=, $3, $7
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    f32.sub $push5=, $4, $8
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = fsub <4 x float> %x, %y
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: div_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype div_v4f32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.div $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @div_v4f32(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: div_v4f32:
+; SIMD128:         .functype div_v4f32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.div $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: div_v4f32:
+; SIMD128-FAST:         .functype div_v4f32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.div $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: div_v4f32:
+; NO-SIMD128:         .functype div_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.div $push0=, $3, $7
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f32.div $push1=, $2, $6
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    f32.div $push2=, $1, $5
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    f32.div $push3=, $4, $8
+; NO-SIMD128-NEXT:    f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: div_v4f32:
+; NO-SIMD128-FAST:         .functype div_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.div $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f32.div $push1=, $2, $6
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.div $push2=, $3, $7
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    f32.div $push5=, $4, $8
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = fdiv <4 x float> %x, %y
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: mul_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype mul_v4f32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.mul $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @mul_v4f32(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: mul_v4f32:
+; SIMD128:         .functype mul_v4f32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.mul $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: mul_v4f32:
+; SIMD128-FAST:         .functype mul_v4f32 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.mul $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: mul_v4f32:
+; NO-SIMD128:         .functype mul_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.mul $push0=, $3, $7
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f32.mul $push1=, $2, $6
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    f32.mul $push2=, $1, $5
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push4=, 12
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    f32.mul $push3=, $4, $8
+; NO-SIMD128-NEXT:    f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: mul_v4f32:
+; NO-SIMD128-FAST:         .functype mul_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.mul $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f32.mul $push1=, $2, $6
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.mul $push2=, $3, $7
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    f32.mul $push5=, $4, $8
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = fmul <4 x float> %x, %y
   ret <4 x float> %a
 }
 
-; CHECK-LABEL: sqrt_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype sqrt_v4f32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.sqrt $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
 define <4 x float> @sqrt_v4f32(<4 x float> %x) {
+; SIMD128-LABEL: sqrt_v4f32:
+; SIMD128:         .functype sqrt_v4f32 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.sqrt $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: sqrt_v4f32:
+; SIMD128-FAST:         .functype sqrt_v4f32 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f32x4.sqrt $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: sqrt_v4f32:
+; NO-SIMD128:         .functype sqrt_v4f32 (i32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.sqrt $push0=, $3
+; NO-SIMD128-NEXT:    f32.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f32.sqrt $push1=, $2
+; NO-SIMD128-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-NEXT:    f32.sqrt $push2=, $1
+; NO-SIMD128-NEXT:    f32.store 0($0), $pop2
+; NO-SIMD128-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-NEXT:    f32.sqrt $push5=, $4
+; NO-SIMD128-NEXT:    f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: sqrt_v4f32:
+; NO-SIMD128-FAST:         .functype sqrt_v4f32 (i32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f32.sqrt $push0=, $1
+; NO-SIMD128-FAST-NEXT:    f32.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f32.sqrt $push1=, $2
+; NO-SIMD128-FAST-NEXT:    f32.store 4($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f32.sqrt $push2=, $3
+; NO-SIMD128-FAST-NEXT:    f32.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.const $push3=, 12
+; NO-SIMD128-FAST-NEXT:    i32.add $push4=, $0, $pop3
+; NO-SIMD128-FAST-NEXT:    f32.sqrt $push5=, $4
+; NO-SIMD128-FAST-NEXT:    f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT:    return
   %a = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
   ret <4 x float> %a
 }
@@ -1825,108 +14122,344 @@ define <4 x float> @sqrt_v4f32(<4 x float> %x) {
 ; ==============================================================================
 ; 2 x double
 ; ==============================================================================
-; CHECK-LABEL: neg_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype neg_v2f64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.neg $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @neg_v2f64(<2 x double> %x) {
   ; nsz makes this semantically equivalent to flipping sign bit
+; SIMD128-LABEL: neg_v2f64:
+; SIMD128:         .functype neg_v2f64 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.neg $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: neg_v2f64:
+; SIMD128-FAST:         .functype neg_v2f64 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f64x2.neg $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: neg_v2f64:
+; NO-SIMD128:         .functype neg_v2f64 (i32, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.neg $push0=, $2
+; NO-SIMD128-NEXT:    f64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f64.neg $push1=, $1
+; NO-SIMD128-NEXT:    f64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: neg_v2f64:
+; NO-SIMD128-FAST:         .functype neg_v2f64 (i32, f64, f64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.neg $push0=, $1
+; NO-SIMD128-FAST-NEXT:    f64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f64.neg $push1=, $2
+; NO-SIMD128-FAST-NEXT:    f64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = fsub nsz <2 x double> <double 0., double 0.>, %x
   ret <2 x double> %a
 }
 
-; CHECK-LABEL: abs_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype abs_v2f64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.abs $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.fabs.v2f64(<2 x double>) nounwind readnone
 define <2 x double> @abs_v2f64(<2 x double> %x) {
+; SIMD128-LABEL: abs_v2f64:
+; SIMD128:         .functype abs_v2f64 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.abs $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: abs_v2f64:
+; SIMD128-FAST:         .functype abs_v2f64 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f64x2.abs $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: abs_v2f64:
+; NO-SIMD128:         .functype abs_v2f64 (i32, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.abs $push0=, $2
+; NO-SIMD128-NEXT:    f64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f64.abs $push1=, $1
+; NO-SIMD128-NEXT:    f64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: abs_v2f64:
+; NO-SIMD128-FAST:         .functype abs_v2f64 (i32, f64, f64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.abs $push0=, $1
+; NO-SIMD128-FAST-NEXT:    f64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f64.abs $push1=, $2
+; NO-SIMD128-FAST-NEXT:    f64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)
   ret <2 x double> %a
 }
 
-; CHECK-LABEL: min_unordered_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype min_unordered_v2f64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2{{$}}
-; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @min_unordered_v2f64(<2 x double> %x) {
+; SIMD128-LABEL: min_unordered_v2f64:
+; SIMD128:         .functype min_unordered_v2f64 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0x1.4p2, 0x1.4p2
+; SIMD128-NEXT:    f64x2.min $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: min_unordered_v2f64:
+; SIMD128-FAST:         .functype min_unordered_v2f64 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.const $push1=, 0x1.4p2, 0x1.4p2
+; SIMD128-FAST-NEXT:    f64x2.min $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: min_unordered_v2f64:
+; NO-SIMD128:         .functype min_unordered_v2f64 (i32, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.const $push0=, 0x1.4p2
+; NO-SIMD128-NEXT:    f64.min $push1=, $2, $pop0
+; NO-SIMD128-NEXT:    f64.store 8($0), $pop1
+; NO-SIMD128-NEXT:    f64.const $push3=, 0x1.4p2
+; NO-SIMD128-NEXT:    f64.min $push2=, $1, $pop3
+; NO-SIMD128-NEXT:    f64.store 0($0), $pop2
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: min_unordered_v2f64:
+; NO-SIMD128-FAST:         .functype min_unordered_v2f64 (i32, f64, f64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.const $push0=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f64.min $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f64.const $push3=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f64.min $push2=, $2, $pop3
+; NO-SIMD128-FAST-NEXT:    f64.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    return
   %cmps = fcmp ule <2 x double> %x, <double 5., double 5.>
   %a = select <2 x i1> %cmps, <2 x double> %x,
     <2 x double> <double 5., double 5.>
   ret <2 x double> %a
 }
 
-; CHECK-LABEL: max_unordered_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype max_unordered_v2f64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2{{$}}
-; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @max_unordered_v2f64(<2 x double> %x) {
+; SIMD128-LABEL: max_unordered_v2f64:
+; SIMD128:         .functype max_unordered_v2f64 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0x1.4p2, 0x1.4p2
+; SIMD128-NEXT:    f64x2.max $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: max_unordered_v2f64:
+; SIMD128-FAST:         .functype max_unordered_v2f64 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.const $push1=, 0x1.4p2, 0x1.4p2
+; SIMD128-FAST-NEXT:    f64x2.max $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: max_unordered_v2f64:
+; NO-SIMD128:         .functype max_unordered_v2f64 (i32, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.const $push0=, 0x1.4p2
+; NO-SIMD128-NEXT:    f64.max $push1=, $2, $pop0
+; NO-SIMD128-NEXT:    f64.store 8($0), $pop1
+; NO-SIMD128-NEXT:    f64.const $push3=, 0x1.4p2
+; NO-SIMD128-NEXT:    f64.max $push2=, $1, $pop3
+; NO-SIMD128-NEXT:    f64.store 0($0), $pop2
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: max_unordered_v2f64:
+; NO-SIMD128-FAST:         .functype max_unordered_v2f64 (i32, f64, f64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.const $push0=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f64.max $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f64.const $push3=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f64.max $push2=, $2, $pop3
+; NO-SIMD128-FAST-NEXT:    f64.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    return
   %cmps = fcmp uge <2 x double> %x, <double 5., double 5.>
   %a = select <2 x i1> %cmps, <2 x double> %x,
     <2 x double> <double 5., double 5.>
   ret <2 x double> %a
 }
 
-; CHECK-LABEL: min_ordered_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype min_ordered_v2f64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2{{$}}
-; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @min_ordered_v2f64(<2 x double> %x) {
+; SIMD128-LABEL: min_ordered_v2f64:
+; SIMD128:         .functype min_ordered_v2f64 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0x1.4p2, 0x1.4p2
+; SIMD128-NEXT:    f64x2.min $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: min_ordered_v2f64:
+; SIMD128-FAST:         .functype min_ordered_v2f64 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.const $push1=, 0x1.4p2, 0x1.4p2
+; SIMD128-FAST-NEXT:    f64x2.min $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: min_ordered_v2f64:
+; NO-SIMD128:         .functype min_ordered_v2f64 (i32, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.const $push0=, 0x1.4p2
+; NO-SIMD128-NEXT:    f64.min $push1=, $2, $pop0
+; NO-SIMD128-NEXT:    f64.store 8($0), $pop1
+; NO-SIMD128-NEXT:    f64.const $push3=, 0x1.4p2
+; NO-SIMD128-NEXT:    f64.min $push2=, $1, $pop3
+; NO-SIMD128-NEXT:    f64.store 0($0), $pop2
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: min_ordered_v2f64:
+; NO-SIMD128-FAST:         .functype min_ordered_v2f64 (i32, f64, f64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.const $push0=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f64.min $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f64.const $push3=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f64.min $push2=, $2, $pop3
+; NO-SIMD128-FAST-NEXT:    f64.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    return
   %cmps = fcmp ole <2 x double> <double 5., double 5.>, %x
   %a = select <2 x i1> %cmps, <2 x double> <double 5., double 5.>,
     <2 x double> %x
   ret <2 x double> %a
 }
 
-; CHECK-LABEL: max_ordered_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype max_ordered_v2f64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.4p2, 0x1.4p2{{$}}
-; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @max_ordered_v2f64(<2 x double> %x) {
+; SIMD128-LABEL: max_ordered_v2f64:
+; SIMD128:         .functype max_ordered_v2f64 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0x1.4p2, 0x1.4p2
+; SIMD128-NEXT:    f64x2.max $push1=, $0, $pop0
+; SIMD128-NEXT:    return $pop1
+;
+; SIMD128-FAST-LABEL: max_ordered_v2f64:
+; SIMD128-FAST:         .functype max_ordered_v2f64 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.const $push1=, 0x1.4p2, 0x1.4p2
+; SIMD128-FAST-NEXT:    f64x2.max $push0=, $0, $pop1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: max_ordered_v2f64:
+; NO-SIMD128:         .functype max_ordered_v2f64 (i32, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.const $push0=, 0x1.4p2
+; NO-SIMD128-NEXT:    f64.max $push1=, $2, $pop0
+; NO-SIMD128-NEXT:    f64.store 8($0), $pop1
+; NO-SIMD128-NEXT:    f64.const $push3=, 0x1.4p2
+; NO-SIMD128-NEXT:    f64.max $push2=, $1, $pop3
+; NO-SIMD128-NEXT:    f64.store 0($0), $pop2
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: max_ordered_v2f64:
+; NO-SIMD128-FAST:         .functype max_ordered_v2f64 (i32, f64, f64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.const $push0=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f64.max $push1=, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f64.const $push3=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT:    f64.max $push2=, $2, $pop3
+; NO-SIMD128-FAST-NEXT:    f64.store 8($0), $pop2
+; NO-SIMD128-FAST-NEXT:    return
   %cmps = fcmp oge <2 x double> <double 5., double 5.>, %x
   %a = select <2 x i1> %cmps, <2 x double> <double 5., double 5.>,
     <2 x double> %x
   ret <2 x double> %a
 }
 
-; CHECK-LABEL: min_intrinsic_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype min_intrinsic_v2f64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
 define <2 x double> @min_intrinsic_v2f64(<2 x double> %x, <2 x double> %y) {
+; SIMD128-LABEL: min_intrinsic_v2f64:
+; SIMD128:         .functype min_intrinsic_v2f64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.min $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: min_intrinsic_v2f64:
+; SIMD128-FAST:         .functype min_intrinsic_v2f64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f64x2.min $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: min_intrinsic_v2f64:
+; NO-SIMD128:         .functype min_intrinsic_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.min $push0=, $2, $4
+; NO-SIMD128-NEXT:    f64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f64.min $push1=, $1, $3
+; NO-SIMD128-NEXT:    f64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: min_intrinsic_v2f64:
+; NO-SIMD128-FAST:         .functype min_intrinsic_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.min $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    f64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f64.min $push1=, $2, $4
+; NO-SIMD128-FAST-NEXT:    f64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> %y)
   ret <2 x double> %a
 }
 
-; CHECK-LABEL: max_intrinsic_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype max_intrinsic_v2f64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
 define <2 x double> @max_intrinsic_v2f64(<2 x double> %x, <2 x double> %y) {
+; SIMD128-LABEL: max_intrinsic_v2f64:
+; SIMD128:         .functype max_intrinsic_v2f64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.max $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: max_intrinsic_v2f64:
+; SIMD128-FAST:         .functype max_intrinsic_v2f64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f64x2.max $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: max_intrinsic_v2f64:
+; NO-SIMD128:         .functype max_intrinsic_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.max $push0=, $2, $4
+; NO-SIMD128-NEXT:    f64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f64.max $push1=, $1, $3
+; NO-SIMD128-NEXT:    f64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: max_intrinsic_v2f64:
+; NO-SIMD128-FAST:         .functype max_intrinsic_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.max $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    f64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f64.max $push1=, $2, $4
+; NO-SIMD128-FAST-NEXT:    f64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> %y)
   ret <2 x double> %a
 }
 
-; CHECK-LABEL: min_const_intrinsic_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype min_const_intrinsic_v2f64 () -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 0x1.4p2, 0x1.4p2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @min_const_intrinsic_v2f64() {
+; SIMD128-LABEL: min_const_intrinsic_v2f64:
+; SIMD128:         .functype min_const_intrinsic_v2f64 () -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0x1.4p2, 0x1.4p2
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: min_const_intrinsic_v2f64:
+; SIMD128-FAST:         .functype min_const_intrinsic_v2f64 () -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.const $push0=, 0x1.4p2, 0x1.4p2
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: min_const_intrinsic_v2f64:
+; NO-SIMD128:         .functype min_const_intrinsic_v2f64 (i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 4617315517961601024
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.const $push1=, 4617315517961601024
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: min_const_intrinsic_v2f64:
+; NO-SIMD128-FAST:         .functype min_const_intrinsic_v2f64 (i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.const $push0=, 4617315517961601024
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.const $push1=, 4617315517961601024
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = call <2 x double> @llvm.minimum.v2f64(
     <2 x double> <double 42., double 42.>,
     <2 x double> <double 5., double 5.>
@@ -1934,12 +14467,36 @@ define <2 x double> @min_const_intrinsic_v2f64() {
   ret <2 x double> %a
 }
 
-; CHECK-LABEL: max_const_intrinsic_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype max_const_intrinsic_v2f64 () -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 0x1.5p5, 0x1.5p5{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @max_const_intrinsic_v2f64() {
+; SIMD128-LABEL: max_const_intrinsic_v2f64:
+; SIMD128:         .functype max_const_intrinsic_v2f64 () -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0x1.5p5, 0x1.5p5
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: max_const_intrinsic_v2f64:
+; SIMD128-FAST:         .functype max_const_intrinsic_v2f64 () -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    v128.const $push0=, 0x1.5p5, 0x1.5p5
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: max_const_intrinsic_v2f64:
+; NO-SIMD128:         .functype max_const_intrinsic_v2f64 (i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 4631107791820423168
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.const $push1=, 4631107791820423168
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: max_const_intrinsic_v2f64:
+; NO-SIMD128-FAST:         .functype max_const_intrinsic_v2f64 (i32) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    i64.const $push0=, 4631107791820423168
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-FAST-NEXT:    i64.const $push1=, 4631107791820423168
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = call <2 x double> @llvm.maximum.v2f64(
     <2 x double> <double 42., double 42.>,
     <2 x double> <double 5., double 5.>
@@ -1947,23 +14504,87 @@ define <2 x double> @max_const_intrinsic_v2f64() {
   ret <2 x double> %a
 }
 
-; CHECK-LABEL: pmin_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype pmin_v2f64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.pmin $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @pmin_v2f64(<2 x double> %x, <2 x double> %y) {
+; SIMD128-LABEL: pmin_v2f64:
+; SIMD128:         .functype pmin_v2f64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.pmin $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: pmin_v2f64:
+; SIMD128-FAST:         .functype pmin_v2f64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f64x2.pmin $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: pmin_v2f64:
+; NO-SIMD128:         .functype pmin_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.lt $push0=, $4, $2
+; NO-SIMD128-NEXT:    f64.select $push1=, $4, $2, $pop0
+; NO-SIMD128-NEXT:    f64.store 8($0), $pop1
+; NO-SIMD128-NEXT:    f64.lt $push2=, $3, $1
+; NO-SIMD128-NEXT:    f64.select $push3=, $3, $1, $pop2
+; NO-SIMD128-NEXT:    f64.store 0($0), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: pmin_v2f64:
+; NO-SIMD128-FAST:         .functype pmin_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.lt $push0=, $3, $1
+; NO-SIMD128-FAST-NEXT:    f64.select $push1=, $3, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f64.lt $push2=, $4, $2
+; NO-SIMD128-FAST-NEXT:    f64.select $push3=, $4, $2, $pop2
+; NO-SIMD128-FAST-NEXT:    f64.store 8($0), $pop3
+; NO-SIMD128-FAST-NEXT:    return
   %c = fcmp olt <2 x double> %y, %x
   %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
   ret <2 x double> %a
 }
 
-; CHECK-LABEL: pmin_int_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype pmin_int_v2f64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.pmin $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @pmin_int_v2f64(<2 x i64> %x, <2 x i64> %y) {
+; SIMD128-LABEL: pmin_int_v2f64:
+; SIMD128:         .functype pmin_int_v2f64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.pmin $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: pmin_int_v2f64:
+; SIMD128-FAST:         .functype pmin_int_v2f64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f64x2.pmin $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: pmin_int_v2f64:
+; NO-SIMD128:         .functype pmin_int_v2f64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.reinterpret_i64 $push1=, $4
+; NO-SIMD128-NEXT:    f64.reinterpret_i64 $push0=, $2
+; NO-SIMD128-NEXT:    f64.lt $push2=, $pop1, $pop0
+; NO-SIMD128-NEXT:    i64.select $push3=, $4, $2, $pop2
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop3
+; NO-SIMD128-NEXT:    f64.reinterpret_i64 $push5=, $3
+; NO-SIMD128-NEXT:    f64.reinterpret_i64 $push4=, $1
+; NO-SIMD128-NEXT:    f64.lt $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT:    i64.select $push7=, $3, $1, $pop6
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop7
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: pmin_int_v2f64:
+; NO-SIMD128-FAST:         .functype pmin_int_v2f64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.reinterpret_i64 $push1=, $3
+; NO-SIMD128-FAST-NEXT:    f64.reinterpret_i64 $push0=, $1
+; NO-SIMD128-FAST-NEXT:    f64.lt $push2=, $pop1, $pop0
+; NO-SIMD128-FAST-NEXT:    i64.select $push3=, $3, $1, $pop2
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    f64.reinterpret_i64 $push5=, $4
+; NO-SIMD128-FAST-NEXT:    f64.reinterpret_i64 $push4=, $2
+; NO-SIMD128-FAST-NEXT:    f64.lt $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT:    i64.select $push7=, $4, $2, $pop6
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop7
+; NO-SIMD128-FAST-NEXT:    return
   %fx = bitcast <2 x i64> %x to <2 x double>
   %fy = bitcast <2 x i64> %y to <2 x double>
   %c = fcmp olt <2 x double> %fy, %fx
@@ -1971,23 +14592,87 @@ define <2 x i64> @pmin_int_v2f64(<2 x i64> %x, <2 x i64> %y) {
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: pmax_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype pmax_v2f64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.pmax $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @pmax_v2f64(<2 x double> %x, <2 x double> %y) {
+; SIMD128-LABEL: pmax_v2f64:
+; SIMD128:         .functype pmax_v2f64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.pmax $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: pmax_v2f64:
+; SIMD128-FAST:         .functype pmax_v2f64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f64x2.pmax $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: pmax_v2f64:
+; NO-SIMD128:         .functype pmax_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.lt $push0=, $2, $4
+; NO-SIMD128-NEXT:    f64.select $push1=, $4, $2, $pop0
+; NO-SIMD128-NEXT:    f64.store 8($0), $pop1
+; NO-SIMD128-NEXT:    f64.lt $push2=, $1, $3
+; NO-SIMD128-NEXT:    f64.select $push3=, $3, $1, $pop2
+; NO-SIMD128-NEXT:    f64.store 0($0), $pop3
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: pmax_v2f64:
+; NO-SIMD128-FAST:         .functype pmax_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.lt $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    f64.select $push1=, $3, $1, $pop0
+; NO-SIMD128-FAST-NEXT:    f64.store 0($0), $pop1
+; NO-SIMD128-FAST-NEXT:    f64.lt $push2=, $2, $4
+; NO-SIMD128-FAST-NEXT:    f64.select $push3=, $4, $2, $pop2
+; NO-SIMD128-FAST-NEXT:    f64.store 8($0), $pop3
+; NO-SIMD128-FAST-NEXT:    return
   %c = fcmp olt <2 x double> %x, %y
   %a = select <2 x i1> %c, <2 x double> %y, <2 x double> %x
   ret <2 x double> %a
 }
 
-; CHECK-LABEL: pmax_int_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype pmax_int_v2f64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.pmax $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @pmax_int_v2f64(<2 x i64> %x, <2 x i64> %y) {
+; SIMD128-LABEL: pmax_int_v2f64:
+; SIMD128:         .functype pmax_int_v2f64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.pmax $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: pmax_int_v2f64:
+; SIMD128-FAST:         .functype pmax_int_v2f64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f64x2.pmax $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: pmax_int_v2f64:
+; NO-SIMD128:         .functype pmax_int_v2f64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.reinterpret_i64 $push1=, $2
+; NO-SIMD128-NEXT:    f64.reinterpret_i64 $push0=, $4
+; NO-SIMD128-NEXT:    f64.lt $push2=, $pop1, $pop0
+; NO-SIMD128-NEXT:    i64.select $push3=, $4, $2, $pop2
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop3
+; NO-SIMD128-NEXT:    f64.reinterpret_i64 $push5=, $1
+; NO-SIMD128-NEXT:    f64.reinterpret_i64 $push4=, $3
+; NO-SIMD128-NEXT:    f64.lt $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT:    i64.select $push7=, $3, $1, $pop6
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop7
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: pmax_int_v2f64:
+; NO-SIMD128-FAST:         .functype pmax_int_v2f64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.reinterpret_i64 $push1=, $1
+; NO-SIMD128-FAST-NEXT:    f64.reinterpret_i64 $push0=, $3
+; NO-SIMD128-FAST-NEXT:    f64.lt $push2=, $pop1, $pop0
+; NO-SIMD128-FAST-NEXT:    i64.select $push3=, $3, $1, $pop2
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop3
+; NO-SIMD128-FAST-NEXT:    f64.reinterpret_i64 $push5=, $2
+; NO-SIMD128-FAST-NEXT:    f64.reinterpret_i64 $push4=, $4
+; NO-SIMD128-FAST-NEXT:    f64.lt $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT:    i64.select $push7=, $4, $2, $pop6
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop7
+; NO-SIMD128-FAST-NEXT:    return
   %fx = bitcast <2 x i64> %x to <2 x double>
   %fy = bitcast <2 x i64> %y to <2 x double>
   %c = fcmp olt <2 x double> %fx, %fy
@@ -1995,53 +14680,173 @@ define <2 x i64> @pmax_int_v2f64(<2 x i64> %x, <2 x i64> %y) {
   ret <2 x i64> %a
 }
 
-; CHECK-LABEL: add_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype add_v2f64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.add $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @add_v2f64(<2 x double> %x, <2 x double> %y) {
+; SIMD128-LABEL: add_v2f64:
+; SIMD128:         .functype add_v2f64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.add $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: add_v2f64:
+; SIMD128-FAST:         .functype add_v2f64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f64x2.add $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: add_v2f64:
+; NO-SIMD128:         .functype add_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.add $push0=, $2, $4
+; NO-SIMD128-NEXT:    f64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f64.add $push1=, $1, $3
+; NO-SIMD128-NEXT:    f64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: add_v2f64:
+; NO-SIMD128-FAST:         .functype add_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.add $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    f64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f64.add $push1=, $2, $4
+; NO-SIMD128-FAST-NEXT:    f64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = fadd <2 x double> %x, %y
   ret <2 x double> %a
 }
 
-; CHECK-LABEL: sub_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype sub_v2f64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.sub $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @sub_v2f64(<2 x double> %x, <2 x double> %y) {
+; SIMD128-LABEL: sub_v2f64:
+; SIMD128:         .functype sub_v2f64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.sub $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: sub_v2f64:
+; SIMD128-FAST:         .functype sub_v2f64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f64x2.sub $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: sub_v2f64:
+; NO-SIMD128:         .functype sub_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.sub $push0=, $2, $4
+; NO-SIMD128-NEXT:    f64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f64.sub $push1=, $1, $3
+; NO-SIMD128-NEXT:    f64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: sub_v2f64:
+; NO-SIMD128-FAST:         .functype sub_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.sub $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    f64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f64.sub $push1=, $2, $4
+; NO-SIMD128-FAST-NEXT:    f64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = fsub <2 x double> %x, %y
   ret <2 x double> %a
 }
 
-; CHECK-LABEL: div_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype div_v2f64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.div $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @div_v2f64(<2 x double> %x, <2 x double> %y) {
+; SIMD128-LABEL: div_v2f64:
+; SIMD128:         .functype div_v2f64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.div $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: div_v2f64:
+; SIMD128-FAST:         .functype div_v2f64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f64x2.div $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: div_v2f64:
+; NO-SIMD128:         .functype div_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.div $push0=, $2, $4
+; NO-SIMD128-NEXT:    f64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f64.div $push1=, $1, $3
+; NO-SIMD128-NEXT:    f64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: div_v2f64:
+; NO-SIMD128-FAST:         .functype div_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.div $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    f64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f64.div $push1=, $2, $4
+; NO-SIMD128-FAST-NEXT:    f64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = fdiv <2 x double> %x, %y
   ret <2 x double> %a
 }
 
-; CHECK-LABEL: mul_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype mul_v2f64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.mul $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @mul_v2f64(<2 x double> %x, <2 x double> %y) {
+; SIMD128-LABEL: mul_v2f64:
+; SIMD128:         .functype mul_v2f64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.mul $push0=, $0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: mul_v2f64:
+; SIMD128-FAST:         .functype mul_v2f64 (v128, v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f64x2.mul $push0=, $0, $1
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: mul_v2f64:
+; NO-SIMD128:         .functype mul_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.mul $push0=, $2, $4
+; NO-SIMD128-NEXT:    f64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f64.mul $push1=, $1, $3
+; NO-SIMD128-NEXT:    f64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: mul_v2f64:
+; NO-SIMD128-FAST:         .functype mul_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.mul $push0=, $1, $3
+; NO-SIMD128-FAST-NEXT:    f64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f64.mul $push1=, $2, $4
+; NO-SIMD128-FAST-NEXT:    f64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = fmul <2 x double> %x, %y
   ret <2 x double> %a
 }
 
-; CHECK-LABEL: sqrt_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype sqrt_v2f64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.sqrt $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
 define <2 x double> @sqrt_v2f64(<2 x double> %x) {
+; SIMD128-LABEL: sqrt_v2f64:
+; SIMD128:         .functype sqrt_v2f64 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.sqrt $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; SIMD128-FAST-LABEL: sqrt_v2f64:
+; SIMD128-FAST:         .functype sqrt_v2f64 (v128) -> (v128)
+; SIMD128-FAST-NEXT:  # %bb.0:
+; SIMD128-FAST-NEXT:    f64x2.sqrt $push0=, $0
+; SIMD128-FAST-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: sqrt_v2f64:
+; NO-SIMD128:         .functype sqrt_v2f64 (i32, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.sqrt $push0=, $2
+; NO-SIMD128-NEXT:    f64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    f64.sqrt $push1=, $1
+; NO-SIMD128-NEXT:    f64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
+;
+; NO-SIMD128-FAST-LABEL: sqrt_v2f64:
+; NO-SIMD128-FAST:         .functype sqrt_v2f64 (i32, f64, f64) -> ()
+; NO-SIMD128-FAST-NEXT:  # %bb.0:
+; NO-SIMD128-FAST-NEXT:    f64.sqrt $push0=, $1
+; NO-SIMD128-FAST-NEXT:    f64.store 0($0), $pop0
+; NO-SIMD128-FAST-NEXT:    f64.sqrt $push1=, $2
+; NO-SIMD128-FAST-NEXT:    f64.store 8($0), $pop1
+; NO-SIMD128-FAST-NEXT:    return
   %a = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   ret <2 x double> %a
 }

diff  --git a/llvm/test/CodeGen/WebAssembly/simd-build-pair.ll b/llvm/test/CodeGen/WebAssembly/simd-build-pair.ll
index af1d80afaf3b6..8ea79ca45c295 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-build-pair.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-build-pair.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=simd128 | FileCheck %s --check-prefixes CHECK
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=simd128 | FileCheck %s
 
 target triple = "wasm32-unknown-unknown"
 
@@ -13,8 +14,14 @@ target triple = "wasm32-unknown-unknown"
 ;    t8: ch = store<(store 8 into `ptr undef`, align 1)> t3:1, t24, undef:i32, undef:i32
 ;  t9: ch = WebAssemblyISD::RETURN t8
 
-; CHECK:      v128.store64_lane
 define void @build_pair_i32s() {
+; CHECK-LABEL: build_pair_i32s:
+; CHECK:         .functype build_pair_i32s () -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    v128.load $push0=, 0($0)
+; CHECK-NEXT:    i8x16.shuffle $push1=, $pop0, $1, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK-NEXT:    v128.store64_lane 0($0):p2align=0, $pop1, 0
+; CHECK-NEXT:    return
 entry:
   %0 = load <4 x i32>, ptr undef, align 16
   %shuffle.i184 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>

diff  --git a/llvm/test/CodeGen/WebAssembly/simd-illegal-signext.ll b/llvm/test/CodeGen/WebAssembly/simd-illegal-signext.ll
index 9e9bcbf21fe84..f448ae37778f8 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-illegal-signext.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-illegal-signext.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mattr=+simd128 | FileCheck %s
 
 ; Regression test for a crash caused by
@@ -9,9 +10,16 @@
 
 target triple = "wasm32-unknown-emscripten"
 
-; CHECK: i32.load8_s
-; CHECK-NEXT: i32.store16
 define void @foo() {
+; CHECK-LABEL: foo:
+; CHECK:         .functype foo () -> ()
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.load8_s 0
+; CHECK-NEXT:    i32.store16 0
+; CHECK-NEXT:    # fallthrough-return
 entry:
   %0 = load ptr, ptr undef, align 4
   %1 = load i32, ptr %0, align 4

diff  --git a/llvm/test/CodeGen/WebAssembly/simd.ll b/llvm/test/CodeGen/WebAssembly/simd.ll
index 723beb4166114..d2a38de4cc855 100644
--- a/llvm/test/CodeGen/WebAssembly/simd.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128,+sign-ext | FileCheck %s --check-prefixes CHECK,SIMD128
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s --check-prefixes CHECK,NO-SIMD128
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128,+sign-ext | FileCheck %s --check-prefix=SIMD128
+; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s --check-prefix=NO-SIMD128
 
 ; Test that basic SIMD128 vector manipulation operations assemble as expected.
 
@@ -8,23 +9,74 @@ target triple = "wasm32-unknown-unknown"
 ; ==============================================================================
 ; 16 x i8
 ; ==============================================================================
-; CHECK-LABEL: const_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype const_v16i8 () -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=,
-; SIMD128-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @const_v16i8() {
+; SIMD128-LABEL: const_v16i8:
+; SIMD128:         .functype const_v16i8 () -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: const_v16i8:
+; NO-SIMD128:         .functype const_v16i8 (i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 1084818905618843912
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.const $push1=, 506097522914230528
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
   ret <16 x i8> <i8 00, i8 01, i8 02, i8 03, i8 04, i8 05, i8 06, i8 07,
                  i8 08, i8 09, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>
 }
 
-; CHECK-LABEL: splat_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype splat_v16i8 (i32) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.splat $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @splat_v16i8(i8 %x) {
+; SIMD128-LABEL: splat_v16i8:
+; SIMD128:         .functype splat_v16i8 (i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.splat $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: splat_v16i8:
+; NO-SIMD128:         .functype splat_v16i8 (i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store8 8($0), $1
+; NO-SIMD128-NEXT:    i32.store8 4($0), $1
+; NO-SIMD128-NEXT:    i32.store8 2($0), $1
+; NO-SIMD128-NEXT:    i32.store8 1($0), $1
+; NO-SIMD128-NEXT:    i32.store8 0($0), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 15
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store8 0($pop1), $1
+; NO-SIMD128-NEXT:    i32.const $push2=, 14
+; NO-SIMD128-NEXT:    i32.add $push3=, $0, $pop2
+; NO-SIMD128-NEXT:    i32.store8 0($pop3), $1
+; NO-SIMD128-NEXT:    i32.const $push4=, 13
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.store8 0($pop5), $1
+; NO-SIMD128-NEXT:    i32.const $push6=, 12
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.store8 0($pop7), $1
+; NO-SIMD128-NEXT:    i32.const $push8=, 11
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.store8 0($pop9), $1
+; NO-SIMD128-NEXT:    i32.const $push10=, 10
+; NO-SIMD128-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-NEXT:    i32.store8 0($pop11), $1
+; NO-SIMD128-NEXT:    i32.const $push12=, 9
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.store8 0($pop13), $1
+; NO-SIMD128-NEXT:    i32.const $push14=, 7
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.store8 0($pop15), $1
+; NO-SIMD128-NEXT:    i32.const $push16=, 6
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.store8 0($pop17), $1
+; NO-SIMD128-NEXT:    i32.const $push18=, 5
+; NO-SIMD128-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-NEXT:    i32.store8 0($pop19), $1
+; NO-SIMD128-NEXT:    i32.const $push20=, 3
+; NO-SIMD128-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-NEXT:    i32.store8 0($pop21), $1
+; NO-SIMD128-NEXT:    return
   %v = insertelement <16 x i8> undef, i8 %x, i32 0
   %res = shufflevector <16 x i8> %v, <16 x i8> undef,
     <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
@@ -32,192 +84,530 @@ define <16 x i8> @splat_v16i8(i8 %x) {
   ret <16 x i8> %res
 }
 
-; CHECK-LABEL: const_splat_v16i8:
-; SIMD128: v128.const $push0=, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42{{$}}
 define <16 x i8> @const_splat_v16i8() {
+; SIMD128-LABEL: const_splat_v16i8:
+; SIMD128:         .functype const_splat_v16i8 () -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: const_splat_v16i8:
+; NO-SIMD128:         .functype const_splat_v16i8 (i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 3038287259199220266
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.const $push1=, 3038287259199220266
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
   ret <16 x i8> <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42,
                  i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
 }
 
-; CHECK-LABEL: extract_v16i8_s:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype extract_v16i8_s (v128) -> (i32){{$}}
-; SIMD128-NEXT: i8x16.extract_lane_s $push[[R:[0-9]+]]=, $0, 13{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_v16i8_s(<16 x i8> %v) {
+; SIMD128-LABEL: extract_v16i8_s:
+; SIMD128:         .functype extract_v16i8_s (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.extract_lane_s $push0=, $0, 13
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_v16i8_s:
+; NO-SIMD128:         .functype extract_v16i8_s (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.extend8_s $push0=, $13
+; NO-SIMD128-NEXT:    return $pop0
   %elem = extractelement <16 x i8> %v, i8 13
   %a = sext i8 %elem to i32
   ret i32 %a
 }
 
-; CHECK-LABEL: extract_var_v16i8_s:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype extract_var_v16i8_s (v128, i32) -> (i32){{$}}
-; SIMD128-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
-; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16
-; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; SIMD128-NEXT: local.tee $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]
-; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0
-; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 15
-; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]
-; SIMD128-NEXT: i32.or $push[[L6:[0-9]+]]=, $2, $pop[[L5]]
-; SIMD128-NEXT: i32.load8_s $push[[R:[0-9]+]]=, 0($pop[[L6]])
-; SIMD128-NEXT: return $pop[[R]]
 define i32 @extract_var_v16i8_s(<16 x i8> %v, i32 %i) {
+; SIMD128-LABEL: extract_var_v16i8_s:
+; SIMD128:         .functype extract_var_v16i8_s (v128, i32) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    global.get $push4=, __stack_pointer
+; SIMD128-NEXT:    i32.const $push5=, 16
+; SIMD128-NEXT:    i32.sub $push7=, $pop4, $pop5
+; SIMD128-NEXT:    local.tee $push6=, $2=, $pop7
+; SIMD128-NEXT:    v128.store 0($pop6), $0
+; SIMD128-NEXT:    i32.const $push0=, 15
+; SIMD128-NEXT:    i32.and $push1=, $1, $pop0
+; SIMD128-NEXT:    i32.or $push2=, $2, $pop1
+; SIMD128-NEXT:    i32.load8_s $push3=, 0($pop2)
+; SIMD128-NEXT:    return $pop3
+;
+; NO-SIMD128-LABEL: extract_var_v16i8_s:
+; NO-SIMD128:         .functype extract_var_v16i8_s (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    global.get $push4=, __stack_pointer
+; NO-SIMD128-NEXT:    i32.const $push5=, 16
+; NO-SIMD128-NEXT:    i32.sub $push7=, $pop4, $pop5
+; NO-SIMD128-NEXT:    local.tee $push6=, $17=, $pop7
+; NO-SIMD128-NEXT:    i32.store8 15($pop6), $15
+; NO-SIMD128-NEXT:    i32.store8 14($17), $14
+; NO-SIMD128-NEXT:    i32.store8 13($17), $13
+; NO-SIMD128-NEXT:    i32.store8 12($17), $12
+; NO-SIMD128-NEXT:    i32.store8 11($17), $11
+; NO-SIMD128-NEXT:    i32.store8 10($17), $10
+; NO-SIMD128-NEXT:    i32.store8 9($17), $9
+; NO-SIMD128-NEXT:    i32.store8 8($17), $8
+; NO-SIMD128-NEXT:    i32.store8 7($17), $7
+; NO-SIMD128-NEXT:    i32.store8 6($17), $6
+; NO-SIMD128-NEXT:    i32.store8 5($17), $5
+; NO-SIMD128-NEXT:    i32.store8 4($17), $4
+; NO-SIMD128-NEXT:    i32.store8 3($17), $3
+; NO-SIMD128-NEXT:    i32.store8 2($17), $2
+; NO-SIMD128-NEXT:    i32.store8 1($17), $1
+; NO-SIMD128-NEXT:    i32.store8 0($17), $0
+; NO-SIMD128-NEXT:    i32.const $push0=, 15
+; NO-SIMD128-NEXT:    i32.and $push1=, $16, $pop0
+; NO-SIMD128-NEXT:    i32.or $push2=, $17, $pop1
+; NO-SIMD128-NEXT:    i32.load8_s $push3=, 0($pop2)
+; NO-SIMD128-NEXT:    return $pop3
   %elem = extractelement <16 x i8> %v, i32 %i
   %a = sext i8 %elem to i32
   ret i32 %a
 }
 
-; CHECK-LABEL: extract_undef_v16i8_s:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype extract_undef_v16i8_s (v128) -> (i32){{$}}
-; SIMD128-NEXT: i8x16.extract_lane_s $push[[R:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_undef_v16i8_s(<16 x i8> %v) {
+; SIMD128-LABEL: extract_undef_v16i8_s:
+; SIMD128:         .functype extract_undef_v16i8_s (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.extract_lane_s $push0=, $0, 0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_undef_v16i8_s:
+; NO-SIMD128:         .functype extract_undef_v16i8_s (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.extend8_s $push0=, $0
+; NO-SIMD128-NEXT:    return $pop0
   %elem = extractelement <16 x i8> %v, i8 undef
   %a = sext i8 %elem to i32
   ret i32 %a
 }
 
-; CHECK-LABEL: extract_v16i8_u:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype extract_v16i8_u (v128) -> (i32){{$}}
-; SIMD128-NEXT: i8x16.extract_lane_u $push[[R:[0-9]+]]=, $0, 13{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_v16i8_u(<16 x i8> %v) {
+; SIMD128-LABEL: extract_v16i8_u:
+; SIMD128:         .functype extract_v16i8_u (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.extract_lane_u $push0=, $0, 13
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_v16i8_u:
+; NO-SIMD128:         .functype extract_v16i8_u (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-NEXT:    i32.and $push1=, $13, $pop0
+; NO-SIMD128-NEXT:    return $pop1
   %elem = extractelement <16 x i8> %v, i8 13
   %a = zext i8 %elem to i32
   ret i32 %a
 }
 
-; CHECK-LABEL: extract_var_v16i8_u:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype extract_var_v16i8_u (v128, i32) -> (i32){{$}}
-; SIMD128-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer{{$}}
-; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: local.tee $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
-; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 15{{$}}
-; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
-; SIMD128-NEXT: i32.or $push[[L6:[0-9]+]]=, $2, $pop[[L5]]{{$}}
-; SIMD128-NEXT: i32.load8_u $push[[R:[0-9]+]]=, 0($pop[[L6]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_var_v16i8_u(<16 x i8> %v, i32 %i) {
+; SIMD128-LABEL: extract_var_v16i8_u:
+; SIMD128:         .functype extract_var_v16i8_u (v128, i32) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    global.get $push4=, __stack_pointer
+; SIMD128-NEXT:    i32.const $push5=, 16
+; SIMD128-NEXT:    i32.sub $push7=, $pop4, $pop5
+; SIMD128-NEXT:    local.tee $push6=, $2=, $pop7
+; SIMD128-NEXT:    v128.store 0($pop6), $0
+; SIMD128-NEXT:    i32.const $push0=, 15
+; SIMD128-NEXT:    i32.and $push1=, $1, $pop0
+; SIMD128-NEXT:    i32.or $push2=, $2, $pop1
+; SIMD128-NEXT:    i32.load8_u $push3=, 0($pop2)
+; SIMD128-NEXT:    return $pop3
+;
+; NO-SIMD128-LABEL: extract_var_v16i8_u:
+; NO-SIMD128:         .functype extract_var_v16i8_u (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    global.get $push4=, __stack_pointer
+; NO-SIMD128-NEXT:    i32.const $push5=, 16
+; NO-SIMD128-NEXT:    i32.sub $push7=, $pop4, $pop5
+; NO-SIMD128-NEXT:    local.tee $push6=, $17=, $pop7
+; NO-SIMD128-NEXT:    i32.store8 15($pop6), $15
+; NO-SIMD128-NEXT:    i32.store8 14($17), $14
+; NO-SIMD128-NEXT:    i32.store8 13($17), $13
+; NO-SIMD128-NEXT:    i32.store8 12($17), $12
+; NO-SIMD128-NEXT:    i32.store8 11($17), $11
+; NO-SIMD128-NEXT:    i32.store8 10($17), $10
+; NO-SIMD128-NEXT:    i32.store8 9($17), $9
+; NO-SIMD128-NEXT:    i32.store8 8($17), $8
+; NO-SIMD128-NEXT:    i32.store8 7($17), $7
+; NO-SIMD128-NEXT:    i32.store8 6($17), $6
+; NO-SIMD128-NEXT:    i32.store8 5($17), $5
+; NO-SIMD128-NEXT:    i32.store8 4($17), $4
+; NO-SIMD128-NEXT:    i32.store8 3($17), $3
+; NO-SIMD128-NEXT:    i32.store8 2($17), $2
+; NO-SIMD128-NEXT:    i32.store8 1($17), $1
+; NO-SIMD128-NEXT:    i32.store8 0($17), $0
+; NO-SIMD128-NEXT:    i32.const $push0=, 15
+; NO-SIMD128-NEXT:    i32.and $push1=, $16, $pop0
+; NO-SIMD128-NEXT:    i32.or $push2=, $17, $pop1
+; NO-SIMD128-NEXT:    i32.load8_u $push3=, 0($pop2)
+; NO-SIMD128-NEXT:    return $pop3
   %elem = extractelement <16 x i8> %v, i32 %i
   %a = zext i8 %elem to i32
   ret i32 %a
 }
 
-; CHECK-LABEL: extract_undef_v16i8_u:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype extract_undef_v16i8_u (v128) -> (i32){{$}}
-; SIMD128-NEXT: i8x16.extract_lane_u $push[[R:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_undef_v16i8_u(<16 x i8> %v) {
+; SIMD128-LABEL: extract_undef_v16i8_u:
+; SIMD128:         .functype extract_undef_v16i8_u (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.extract_lane_u $push0=, $0, 0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_undef_v16i8_u:
+; NO-SIMD128:         .functype extract_undef_v16i8_u (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 255
+; NO-SIMD128-NEXT:    i32.and $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    return $pop1
   %elem = extractelement <16 x i8> %v, i8 undef
   %a = zext i8 %elem to i32
   ret i32 %a
 }
 
-; CHECK-LABEL: extract_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype extract_v16i8 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i8x16.extract_lane_u $push[[R:[0-9]+]]=, $0, 13{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i8 @extract_v16i8(<16 x i8> %v) {
+; SIMD128-LABEL: extract_v16i8:
+; SIMD128:         .functype extract_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.extract_lane_u $push0=, $0, 13
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_v16i8:
+; NO-SIMD128:         .functype extract_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    return $13
   %elem = extractelement <16 x i8> %v, i8 13
   ret i8 %elem
 }
 
-; CHECK-LABEL: extract_var_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype extract_var_v16i8 (v128, i32) -> (i32){{$}}
-; SIMD128-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer{{$}}
-; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: local.tee $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
-; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 15{{$}}
-; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
-; SIMD128-NEXT: i32.or $push[[L6:[0-9]+]]=, $2, $pop[[L5]]{{$}}
-; SIMD128-NEXT: i32.load8_u $push[[R:[0-9]+]]=, 0($pop[[L6]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i8 @extract_var_v16i8(<16 x i8> %v, i32 %i) {
+; SIMD128-LABEL: extract_var_v16i8:
+; SIMD128:         .functype extract_var_v16i8 (v128, i32) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    global.get $push4=, __stack_pointer
+; SIMD128-NEXT:    i32.const $push5=, 16
+; SIMD128-NEXT:    i32.sub $push7=, $pop4, $pop5
+; SIMD128-NEXT:    local.tee $push6=, $2=, $pop7
+; SIMD128-NEXT:    v128.store 0($pop6), $0
+; SIMD128-NEXT:    i32.const $push0=, 15
+; SIMD128-NEXT:    i32.and $push1=, $1, $pop0
+; SIMD128-NEXT:    i32.or $push2=, $2, $pop1
+; SIMD128-NEXT:    i32.load8_u $push3=, 0($pop2)
+; SIMD128-NEXT:    return $pop3
+;
+; NO-SIMD128-LABEL: extract_var_v16i8:
+; NO-SIMD128:         .functype extract_var_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    global.get $push4=, __stack_pointer
+; NO-SIMD128-NEXT:    i32.const $push5=, 16
+; NO-SIMD128-NEXT:    i32.sub $push7=, $pop4, $pop5
+; NO-SIMD128-NEXT:    local.tee $push6=, $17=, $pop7
+; NO-SIMD128-NEXT:    i32.store8 15($pop6), $15
+; NO-SIMD128-NEXT:    i32.store8 14($17), $14
+; NO-SIMD128-NEXT:    i32.store8 13($17), $13
+; NO-SIMD128-NEXT:    i32.store8 12($17), $12
+; NO-SIMD128-NEXT:    i32.store8 11($17), $11
+; NO-SIMD128-NEXT:    i32.store8 10($17), $10
+; NO-SIMD128-NEXT:    i32.store8 9($17), $9
+; NO-SIMD128-NEXT:    i32.store8 8($17), $8
+; NO-SIMD128-NEXT:    i32.store8 7($17), $7
+; NO-SIMD128-NEXT:    i32.store8 6($17), $6
+; NO-SIMD128-NEXT:    i32.store8 5($17), $5
+; NO-SIMD128-NEXT:    i32.store8 4($17), $4
+; NO-SIMD128-NEXT:    i32.store8 3($17), $3
+; NO-SIMD128-NEXT:    i32.store8 2($17), $2
+; NO-SIMD128-NEXT:    i32.store8 1($17), $1
+; NO-SIMD128-NEXT:    i32.store8 0($17), $0
+; NO-SIMD128-NEXT:    i32.const $push0=, 15
+; NO-SIMD128-NEXT:    i32.and $push1=, $16, $pop0
+; NO-SIMD128-NEXT:    i32.or $push2=, $17, $pop1
+; NO-SIMD128-NEXT:    i32.load8_u $push3=, 0($pop2)
+; NO-SIMD128-NEXT:    return $pop3
   %elem = extractelement <16 x i8> %v, i32 %i
   ret i8 %elem
 }
 
-; CHECK-LABEL: extract_undef_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype extract_undef_v16i8 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i8x16.extract_lane_u $push[[R:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i8 @extract_undef_v16i8(<16 x i8> %v) {
+; SIMD128-LABEL: extract_undef_v16i8:
+; SIMD128:         .functype extract_undef_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.extract_lane_u $push0=, $0, 0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_undef_v16i8:
+; NO-SIMD128:         .functype extract_undef_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    return $0
   %elem = extractelement <16 x i8> %v, i8 undef
   ret i8 %elem
 }
 
-; CHECK-LABEL: replace_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype replace_v16i8 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[R:[0-9]+]]=, $0, 11, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @replace_v16i8(<16 x i8> %v, i8 %x) {
+; SIMD128-LABEL: replace_v16i8:
+; SIMD128:         .functype replace_v16i8 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.replace_lane $push0=, $0, 11, $1
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: replace_v16i8:
+; NO-SIMD128:         .functype replace_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store8 8($0), $9
+; NO-SIMD128-NEXT:    i32.store8 4($0), $5
+; NO-SIMD128-NEXT:    i32.store8 2($0), $3
+; NO-SIMD128-NEXT:    i32.store8 1($0), $2
+; NO-SIMD128-NEXT:    i32.store8 0($0), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 15
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store8 0($pop1), $16
+; NO-SIMD128-NEXT:    i32.const $push2=, 14
+; NO-SIMD128-NEXT:    i32.add $push3=, $0, $pop2
+; NO-SIMD128-NEXT:    i32.store8 0($pop3), $15
+; NO-SIMD128-NEXT:    i32.const $push4=, 13
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.store8 0($pop5), $14
+; NO-SIMD128-NEXT:    i32.const $push6=, 12
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.store8 0($pop7), $13
+; NO-SIMD128-NEXT:    i32.const $push8=, 11
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.store8 0($pop9), $17
+; NO-SIMD128-NEXT:    i32.const $push10=, 10
+; NO-SIMD128-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-NEXT:    i32.store8 0($pop11), $11
+; NO-SIMD128-NEXT:    i32.const $push12=, 9
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.store8 0($pop13), $10
+; NO-SIMD128-NEXT:    i32.const $push14=, 7
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.store8 0($pop15), $8
+; NO-SIMD128-NEXT:    i32.const $push16=, 6
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.store8 0($pop17), $7
+; NO-SIMD128-NEXT:    i32.const $push18=, 5
+; NO-SIMD128-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-NEXT:    i32.store8 0($pop19), $6
+; NO-SIMD128-NEXT:    i32.const $push20=, 3
+; NO-SIMD128-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-NEXT:    i32.store8 0($pop21), $4
+; NO-SIMD128-NEXT:    return
   %res = insertelement <16 x i8> %v, i8 %x, i32 11
   ret <16 x i8> %res
 }
 
-; CHECK-LABEL: replace_var_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype replace_var_v16i8 (v128, i32, i32) -> (v128){{$}}
-; SIMD128-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer{{$}}
-; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: local.tee $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
-; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 15{{$}}
-; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
-; SIMD128-NEXT: i32.or $push[[L6:[0-9]+]]=, $3, $pop[[L5]]{{$}}
-; SIMD128-NEXT: i32.store8 0($pop[[L6]]), $2{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @replace_var_v16i8(<16 x i8> %v, i32 %i, i8 %x) {
+; SIMD128-LABEL: replace_var_v16i8:
+; SIMD128:         .functype replace_var_v16i8 (v128, i32, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    global.get $push4=, __stack_pointer
+; SIMD128-NEXT:    i32.const $push5=, 16
+; SIMD128-NEXT:    i32.sub $push7=, $pop4, $pop5
+; SIMD128-NEXT:    local.tee $push6=, $3=, $pop7
+; SIMD128-NEXT:    v128.store 0($pop6), $0
+; SIMD128-NEXT:    i32.const $push0=, 15
+; SIMD128-NEXT:    i32.and $push1=, $1, $pop0
+; SIMD128-NEXT:    i32.or $push2=, $3, $pop1
+; SIMD128-NEXT:    i32.store8 0($pop2), $2
+; SIMD128-NEXT:    v128.load $push3=, 0($3)
+; SIMD128-NEXT:    return $pop3
+;
+; NO-SIMD128-LABEL: replace_var_v16i8:
+; NO-SIMD128:         .functype replace_var_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    global.get $push5=, __stack_pointer
+; NO-SIMD128-NEXT:    i32.const $push6=, 16
+; NO-SIMD128-NEXT:    i32.sub $push8=, $pop5, $pop6
+; NO-SIMD128-NEXT:    local.tee $push7=, $19=, $pop8
+; NO-SIMD128-NEXT:    i32.store8 15($pop7), $16
+; NO-SIMD128-NEXT:    i32.store8 14($19), $15
+; NO-SIMD128-NEXT:    i32.store8 13($19), $14
+; NO-SIMD128-NEXT:    i32.store8 12($19), $13
+; NO-SIMD128-NEXT:    i32.store8 11($19), $12
+; NO-SIMD128-NEXT:    i32.store8 10($19), $11
+; NO-SIMD128-NEXT:    i32.store8 9($19), $10
+; NO-SIMD128-NEXT:    i32.store8 8($19), $9
+; NO-SIMD128-NEXT:    i32.store8 7($19), $8
+; NO-SIMD128-NEXT:    i32.store8 6($19), $7
+; NO-SIMD128-NEXT:    i32.store8 5($19), $6
+; NO-SIMD128-NEXT:    i32.store8 4($19), $5
+; NO-SIMD128-NEXT:    i32.store8 3($19), $4
+; NO-SIMD128-NEXT:    i32.store8 2($19), $3
+; NO-SIMD128-NEXT:    i32.store8 1($19), $2
+; NO-SIMD128-NEXT:    i32.store8 0($19), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 15
+; NO-SIMD128-NEXT:    i32.and $push1=, $17, $pop0
+; NO-SIMD128-NEXT:    i32.or $push2=, $19, $pop1
+; NO-SIMD128-NEXT:    i32.store8 0($pop2), $18
+; NO-SIMD128-NEXT:    i64.load $push3=, 8($19)
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop3
+; NO-SIMD128-NEXT:    i64.load $push4=, 0($19)
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop4
+; NO-SIMD128-NEXT:    return
   %res = insertelement <16 x i8> %v, i8 %x, i32 %i
   ret <16 x i8> %res
 }
 
-; CHECK-LABEL: replace_zero_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype replace_zero_v16i8 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @replace_zero_v16i8(<16 x i8> %v, i8 %x) {
+; SIMD128-LABEL: replace_zero_v16i8:
+; SIMD128:         .functype replace_zero_v16i8 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.replace_lane $push0=, $0, 0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: replace_zero_v16i8:
+; NO-SIMD128:         .functype replace_zero_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store8 8($0), $9
+; NO-SIMD128-NEXT:    i32.store8 4($0), $5
+; NO-SIMD128-NEXT:    i32.store8 2($0), $3
+; NO-SIMD128-NEXT:    i32.store8 1($0), $2
+; NO-SIMD128-NEXT:    i32.store8 0($0), $17
+; NO-SIMD128-NEXT:    i32.const $push0=, 15
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store8 0($pop1), $16
+; NO-SIMD128-NEXT:    i32.const $push2=, 14
+; NO-SIMD128-NEXT:    i32.add $push3=, $0, $pop2
+; NO-SIMD128-NEXT:    i32.store8 0($pop3), $15
+; NO-SIMD128-NEXT:    i32.const $push4=, 13
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.store8 0($pop5), $14
+; NO-SIMD128-NEXT:    i32.const $push6=, 12
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.store8 0($pop7), $13
+; NO-SIMD128-NEXT:    i32.const $push8=, 11
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.store8 0($pop9), $12
+; NO-SIMD128-NEXT:    i32.const $push10=, 10
+; NO-SIMD128-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-NEXT:    i32.store8 0($pop11), $11
+; NO-SIMD128-NEXT:    i32.const $push12=, 9
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.store8 0($pop13), $10
+; NO-SIMD128-NEXT:    i32.const $push14=, 7
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.store8 0($pop15), $8
+; NO-SIMD128-NEXT:    i32.const $push16=, 6
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.store8 0($pop17), $7
+; NO-SIMD128-NEXT:    i32.const $push18=, 5
+; NO-SIMD128-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-NEXT:    i32.store8 0($pop19), $6
+; NO-SIMD128-NEXT:    i32.const $push20=, 3
+; NO-SIMD128-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-NEXT:    i32.store8 0($pop21), $4
+; NO-SIMD128-NEXT:    return
   %res = insertelement <16 x i8> %v, i8 %x, i32 0
   ret <16 x i8> %res
 }
 
-; CHECK-LABEL: shuffle_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shuffle_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
-; SIMD128-SAME: 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SIMD128-LABEL: shuffle_v16i8:
+; SIMD128:         .functype shuffle_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $1, 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shuffle_v16i8:
+; NO-SIMD128:         .functype shuffle_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store8 8($0), $9
+; NO-SIMD128-NEXT:    i32.store8 4($0), $5
+; NO-SIMD128-NEXT:    i32.store8 2($0), $3
+; NO-SIMD128-NEXT:    i32.store8 1($0), $18
+; NO-SIMD128-NEXT:    i32.store8 0($0), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 15
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store8 0($pop1), $32
+; NO-SIMD128-NEXT:    i32.const $push2=, 14
+; NO-SIMD128-NEXT:    i32.add $push3=, $0, $pop2
+; NO-SIMD128-NEXT:    i32.store8 0($pop3), $15
+; NO-SIMD128-NEXT:    i32.const $push4=, 13
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.store8 0($pop5), $30
+; NO-SIMD128-NEXT:    i32.const $push6=, 12
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.store8 0($pop7), $13
+; NO-SIMD128-NEXT:    i32.const $push8=, 11
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.store8 0($pop9), $28
+; NO-SIMD128-NEXT:    i32.const $push10=, 10
+; NO-SIMD128-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-NEXT:    i32.store8 0($pop11), $11
+; NO-SIMD128-NEXT:    i32.const $push12=, 9
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.store8 0($pop13), $26
+; NO-SIMD128-NEXT:    i32.const $push14=, 7
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.store8 0($pop15), $24
+; NO-SIMD128-NEXT:    i32.const $push16=, 6
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.store8 0($pop17), $7
+; NO-SIMD128-NEXT:    i32.const $push18=, 5
+; NO-SIMD128-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-NEXT:    i32.store8 0($pop19), $22
+; NO-SIMD128-NEXT:    i32.const $push20=, 3
+; NO-SIMD128-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-NEXT:    i32.store8 0($pop21), $20
+; NO-SIMD128-NEXT:    return
   %res = shufflevector <16 x i8> %x, <16 x i8> %y,
     <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23,
                 i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
   ret <16 x i8> %res
 }
 
-; CHECK-LABEL: shuffle_undef_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shuffle_undef_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
-; SIMD128-SAME: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SIMD128-LABEL: shuffle_undef_v16i8:
+; SIMD128:         .functype shuffle_undef_v16i8 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shuffle_undef_v16i8:
+; NO-SIMD128:         .functype shuffle_undef_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store8 8($0), $2
+; NO-SIMD128-NEXT:    i32.store8 4($0), $2
+; NO-SIMD128-NEXT:    i32.store8 2($0), $2
+; NO-SIMD128-NEXT:    i32.store8 1($0), $2
+; NO-SIMD128-NEXT:    i32.store8 0($0), $2
+; NO-SIMD128-NEXT:    i32.const $push0=, 15
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store8 0($pop1), $2
+; NO-SIMD128-NEXT:    i32.const $push2=, 14
+; NO-SIMD128-NEXT:    i32.add $push3=, $0, $pop2
+; NO-SIMD128-NEXT:    i32.store8 0($pop3), $2
+; NO-SIMD128-NEXT:    i32.const $push4=, 13
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.store8 0($pop5), $2
+; NO-SIMD128-NEXT:    i32.const $push6=, 12
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.store8 0($pop7), $2
+; NO-SIMD128-NEXT:    i32.const $push8=, 11
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.store8 0($pop9), $2
+; NO-SIMD128-NEXT:    i32.const $push10=, 10
+; NO-SIMD128-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-NEXT:    i32.store8 0($pop11), $2
+; NO-SIMD128-NEXT:    i32.const $push12=, 9
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.store8 0($pop13), $2
+; NO-SIMD128-NEXT:    i32.const $push14=, 7
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.store8 0($pop15), $2
+; NO-SIMD128-NEXT:    i32.const $push16=, 6
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.store8 0($pop17), $2
+; NO-SIMD128-NEXT:    i32.const $push18=, 5
+; NO-SIMD128-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-NEXT:    i32.store8 0($pop19), $2
+; NO-SIMD128-NEXT:    i32.const $push20=, 3
+; NO-SIMD128-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-NEXT:    i32.store8 0($pop21), $2
+; NO-SIMD128-NEXT:    return
   %res = shufflevector <16 x i8> %x, <16 x i8> %y,
     <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
                 i32 undef, i32 undef, i32 undef, i32 undef,
@@ -226,27 +616,70 @@ define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i8> %res
 }
 
-; CHECK-LABEL: build_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype build_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.splat $push[[L0:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 1, $1{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[L2:[0-9]+]]=, $pop[[L1]], 2, $2{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[L3:[0-9]+]]=, $pop[[L2]], 3, $3{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[L4:[0-9]+]]=, $pop[[L3]], 4, $4{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[L5:[0-9]+]]=, $pop[[L4]], 5, $5{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[L6:[0-9]+]]=, $pop[[L5]], 6, $6{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[L7:[0-9]+]]=, $pop[[L6]], 7, $7{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[L8:[0-9]+]]=, $pop[[L7]], 8, $8{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[L9:[0-9]+]]=, $pop[[L8]], 9, $9{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[L10:[0-9]+]]=, $pop[[L9]], 10, $10{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[L11:[0-9]+]]=, $pop[[L10]], 11, $11{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[L12:[0-9]+]]=, $pop[[L11]], 12, $12{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[L13:[0-9]+]]=, $pop[[L12]], 13, $13{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[L14:[0-9]+]]=, $pop[[L13]], 14, $14{{$}}
-; SIMD128-NEXT: i8x16.replace_lane $push[[R:[0-9]+]]=, $pop[[L14]], 15, $15{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @build_v16i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3,
+; SIMD128-LABEL: build_v16i8:
+; SIMD128:         .functype build_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.splat $push0=, $0
+; SIMD128-NEXT:    i8x16.replace_lane $push1=, $pop0, 1, $1
+; SIMD128-NEXT:    i8x16.replace_lane $push2=, $pop1, 2, $2
+; SIMD128-NEXT:    i8x16.replace_lane $push3=, $pop2, 3, $3
+; SIMD128-NEXT:    i8x16.replace_lane $push4=, $pop3, 4, $4
+; SIMD128-NEXT:    i8x16.replace_lane $push5=, $pop4, 5, $5
+; SIMD128-NEXT:    i8x16.replace_lane $push6=, $pop5, 6, $6
+; SIMD128-NEXT:    i8x16.replace_lane $push7=, $pop6, 7, $7
+; SIMD128-NEXT:    i8x16.replace_lane $push8=, $pop7, 8, $8
+; SIMD128-NEXT:    i8x16.replace_lane $push9=, $pop8, 9, $9
+; SIMD128-NEXT:    i8x16.replace_lane $push10=, $pop9, 10, $10
+; SIMD128-NEXT:    i8x16.replace_lane $push11=, $pop10, 11, $11
+; SIMD128-NEXT:    i8x16.replace_lane $push12=, $pop11, 12, $12
+; SIMD128-NEXT:    i8x16.replace_lane $push13=, $pop12, 13, $13
+; SIMD128-NEXT:    i8x16.replace_lane $push14=, $pop13, 14, $14
+; SIMD128-NEXT:    i8x16.replace_lane $push15=, $pop14, 15, $15
+; SIMD128-NEXT:    return $pop15
+;
+; NO-SIMD128-LABEL: build_v16i8:
+; NO-SIMD128:         .functype build_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store8 8($0), $9
+; NO-SIMD128-NEXT:    i32.store8 4($0), $5
+; NO-SIMD128-NEXT:    i32.store8 2($0), $3
+; NO-SIMD128-NEXT:    i32.store8 1($0), $2
+; NO-SIMD128-NEXT:    i32.store8 0($0), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 15
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store8 0($pop1), $16
+; NO-SIMD128-NEXT:    i32.const $push2=, 14
+; NO-SIMD128-NEXT:    i32.add $push3=, $0, $pop2
+; NO-SIMD128-NEXT:    i32.store8 0($pop3), $15
+; NO-SIMD128-NEXT:    i32.const $push4=, 13
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.store8 0($pop5), $14
+; NO-SIMD128-NEXT:    i32.const $push6=, 12
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.store8 0($pop7), $13
+; NO-SIMD128-NEXT:    i32.const $push8=, 11
+; NO-SIMD128-NEXT:    i32.add $push9=, $0, $pop8
+; NO-SIMD128-NEXT:    i32.store8 0($pop9), $12
+; NO-SIMD128-NEXT:    i32.const $push10=, 10
+; NO-SIMD128-NEXT:    i32.add $push11=, $0, $pop10
+; NO-SIMD128-NEXT:    i32.store8 0($pop11), $11
+; NO-SIMD128-NEXT:    i32.const $push12=, 9
+; NO-SIMD128-NEXT:    i32.add $push13=, $0, $pop12
+; NO-SIMD128-NEXT:    i32.store8 0($pop13), $10
+; NO-SIMD128-NEXT:    i32.const $push14=, 7
+; NO-SIMD128-NEXT:    i32.add $push15=, $0, $pop14
+; NO-SIMD128-NEXT:    i32.store8 0($pop15), $8
+; NO-SIMD128-NEXT:    i32.const $push16=, 6
+; NO-SIMD128-NEXT:    i32.add $push17=, $0, $pop16
+; NO-SIMD128-NEXT:    i32.store8 0($pop17), $7
+; NO-SIMD128-NEXT:    i32.const $push18=, 5
+; NO-SIMD128-NEXT:    i32.add $push19=, $0, $pop18
+; NO-SIMD128-NEXT:    i32.store8 0($pop19), $6
+; NO-SIMD128-NEXT:    i32.const $push20=, 3
+; NO-SIMD128-NEXT:    i32.add $push21=, $0, $pop20
+; NO-SIMD128-NEXT:    i32.store8 0($pop21), $4
+; NO-SIMD128-NEXT:    return
                               i8 %x4, i8 %x5, i8 %x6, i8 %x7,
                               i8 %x8, i8 %x9, i8 %x10, i8 %x11,
                               i8 %x12, i8 %x13, i8 %x14, i8 %x15) {
@@ -272,239 +705,516 @@ define <16 x i8> @build_v16i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3,
 ; ==============================================================================
 ; 8 x i16
 ; ==============================================================================
-; CHECK-LABEL: const_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype const_v8i16 () -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 256, 770, 1284, 1798, 2312, 2826, 3340, 3854{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @const_v8i16() {
+; SIMD128-LABEL: const_v8i16:
+; SIMD128:         .functype const_v8i16 () -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 256, 770, 1284, 1798, 2312, 2826, 3340, 3854
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: const_v8i16:
+; NO-SIMD128:         .functype const_v8i16 (i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 1084818905618843912
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.const $push1=, 506097522914230528
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
   ret <8 x i16> <i16 256, i16 770, i16 1284, i16 1798,
                  i16 2312, i16 2826, i16 3340, i16 3854>
 }
 
-; CHECK-LABEL: splat_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype splat_v8i16 (i32) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.splat $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @splat_v8i16(i16 %x) {
+; SIMD128-LABEL: splat_v8i16:
+; SIMD128:         .functype splat_v8i16 (i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.splat $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: splat_v8i16:
+; NO-SIMD128:         .functype splat_v8i16 (i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store16 8($0), $1
+; NO-SIMD128-NEXT:    i32.store16 4($0), $1
+; NO-SIMD128-NEXT:    i32.store16 2($0), $1
+; NO-SIMD128-NEXT:    i32.store16 0($0), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 14
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store16 0($pop1), $1
+; NO-SIMD128-NEXT:    i32.const $push2=, 12
+; NO-SIMD128-NEXT:    i32.add $push3=, $0, $pop2
+; NO-SIMD128-NEXT:    i32.store16 0($pop3), $1
+; NO-SIMD128-NEXT:    i32.const $push4=, 10
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.store16 0($pop5), $1
+; NO-SIMD128-NEXT:    i32.const $push6=, 6
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.store16 0($pop7), $1
+; NO-SIMD128-NEXT:    return
   %v = insertelement <8 x i16> undef, i16 %x, i32 0
   %res = shufflevector <8 x i16> %v, <8 x i16> undef,
     <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x i16> %res
 }
 
-; CHECK-LABEL: const_splat_v8i16:
-; SIMD128: v128.const $push0=, 42, 42, 42, 42, 42, 42, 42, 42{{$}}
 define <8 x i16> @const_splat_v8i16() {
+; SIMD128-LABEL: const_splat_v8i16:
+; SIMD128:         .functype const_splat_v8i16 () -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 42, 42, 42, 42, 42, 42, 42, 42
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: const_splat_v8i16:
+; NO-SIMD128:         .functype const_splat_v8i16 (i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 11822129413226538
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.const $push1=, 11822129413226538
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
   ret <8 x i16> <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
 }
 
-; CHECK-LABEL: extract_v8i16_s:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype extract_v8i16_s (v128) -> (i32){{$}}
-; SIMD128-NEXT: i16x8.extract_lane_s $push[[R:[0-9]+]]=, $0, 5{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_v8i16_s(<8 x i16> %v) {
+; SIMD128-LABEL: extract_v8i16_s:
+; SIMD128:         .functype extract_v8i16_s (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extract_lane_s $push0=, $0, 5
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_v8i16_s:
+; NO-SIMD128:         .functype extract_v8i16_s (i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.extend16_s $push0=, $5
+; NO-SIMD128-NEXT:    return $pop0
   %elem = extractelement <8 x i16> %v, i16 5
   %a = sext i16 %elem to i32
   ret i32 %a
 }
 
-; CHECK-LABEL: extract_var_v8i16_s:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype extract_var_v8i16_s (v128, i32) -> (i32){{$}}
-; SIMD128-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer{{$}}
-; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: local.tee $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
-; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 7{{$}}
-; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
-; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 1{{$}}
-; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
-; SIMD128-NEXT: i32.or $push[[L8:[0-9]+]]=, $2, $pop[[L7]]{{$}}
-; SIMD128-NEXT: i32.load16_s $push[[R:[0-9]+]]=, 0($pop[[L8]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_var_v8i16_s(<8 x i16> %v, i32 %i) {
+; SIMD128-LABEL: extract_var_v8i16_s:
+; SIMD128:         .functype extract_var_v8i16_s (v128, i32) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; SIMD128-NEXT:    i32.const $push7=, 16
+; SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; SIMD128-NEXT:    local.tee $push8=, $2=, $pop9
+; SIMD128-NEXT:    v128.store 0($pop8), $0
+; SIMD128-NEXT:    i32.const $push0=, 7
+; SIMD128-NEXT:    i32.and $push1=, $1, $pop0
+; SIMD128-NEXT:    i32.const $push2=, 1
+; SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; SIMD128-NEXT:    i32.or $push4=, $2, $pop3
+; SIMD128-NEXT:    i32.load16_s $push5=, 0($pop4)
+; SIMD128-NEXT:    return $pop5
+;
+; NO-SIMD128-LABEL: extract_var_v8i16_s:
+; NO-SIMD128:         .functype extract_var_v8i16_s (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; NO-SIMD128-NEXT:    i32.const $push7=, 16
+; NO-SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; NO-SIMD128-NEXT:    local.tee $push8=, $9=, $pop9
+; NO-SIMD128-NEXT:    i32.store16 14($pop8), $7
+; NO-SIMD128-NEXT:    i32.store16 12($9), $6
+; NO-SIMD128-NEXT:    i32.store16 10($9), $5
+; NO-SIMD128-NEXT:    i32.store16 8($9), $4
+; NO-SIMD128-NEXT:    i32.store16 6($9), $3
+; NO-SIMD128-NEXT:    i32.store16 4($9), $2
+; NO-SIMD128-NEXT:    i32.store16 2($9), $1
+; NO-SIMD128-NEXT:    i32.store16 0($9), $0
+; NO-SIMD128-NEXT:    i32.const $push0=, 7
+; NO-SIMD128-NEXT:    i32.and $push1=, $8, $pop0
+; NO-SIMD128-NEXT:    i32.const $push2=, 1
+; NO-SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; NO-SIMD128-NEXT:    i32.or $push4=, $9, $pop3
+; NO-SIMD128-NEXT:    i32.load16_s $push5=, 0($pop4)
+; NO-SIMD128-NEXT:    return $pop5
   %elem = extractelement <8 x i16> %v, i32 %i
   %a = sext i16 %elem to i32
   ret i32 %a
 }
 
-; CHECK-LABEL: extract_undef_v8i16_s:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype extract_undef_v8i16_s (v128) -> (i32){{$}}
-; SIMD128-NEXT: i16x8.extract_lane_s $push[[R:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_undef_v8i16_s(<8 x i16> %v) {
+; SIMD128-LABEL: extract_undef_v8i16_s:
+; SIMD128:         .functype extract_undef_v8i16_s (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extract_lane_s $push0=, $0, 0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_undef_v8i16_s:
+; NO-SIMD128:         .functype extract_undef_v8i16_s (i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.extend16_s $push0=, $0
+; NO-SIMD128-NEXT:    return $pop0
   %elem = extractelement <8 x i16> %v, i16 undef
   %a = sext i16 %elem to i32
   ret i32 %a
 }
 
-; CHECK-LABEL: extract_v8i16_u:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype extract_v8i16_u (v128) -> (i32){{$}}
-; SIMD128-NEXT: i16x8.extract_lane_u $push[[R:[0-9]+]]=, $0, 5{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_v8i16_u(<8 x i16> %v) {
+; SIMD128-LABEL: extract_v8i16_u:
+; SIMD128:         .functype extract_v8i16_u (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extract_lane_u $push0=, $0, 5
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_v8i16_u:
+; NO-SIMD128:         .functype extract_v8i16_u (i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-NEXT:    i32.and $push1=, $5, $pop0
+; NO-SIMD128-NEXT:    return $pop1
   %elem = extractelement <8 x i16> %v, i16 5
   %a = zext i16 %elem to i32
   ret i32 %a
 }
 
-; CHECK-LABEL: extract_var_v8i16_u:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype extract_var_v8i16_u (v128, i32) -> (i32){{$}}
-; SIMD128-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer{{$}}
-; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: local.tee $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
-; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 7{{$}}
-; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
-; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 1{{$}}
-; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
-; SIMD128-NEXT: i32.or $push[[L8:[0-9]+]]=, $2, $pop[[L7]]{{$}}
-; SIMD128-NEXT: i32.load16_u $push[[R:[0-9]+]]=, 0($pop[[L8]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_var_v8i16_u(<8 x i16> %v, i32 %i) {
+; SIMD128-LABEL: extract_var_v8i16_u:
+; SIMD128:         .functype extract_var_v8i16_u (v128, i32) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; SIMD128-NEXT:    i32.const $push7=, 16
+; SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; SIMD128-NEXT:    local.tee $push8=, $2=, $pop9
+; SIMD128-NEXT:    v128.store 0($pop8), $0
+; SIMD128-NEXT:    i32.const $push0=, 7
+; SIMD128-NEXT:    i32.and $push1=, $1, $pop0
+; SIMD128-NEXT:    i32.const $push2=, 1
+; SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; SIMD128-NEXT:    i32.or $push4=, $2, $pop3
+; SIMD128-NEXT:    i32.load16_u $push5=, 0($pop4)
+; SIMD128-NEXT:    return $pop5
+;
+; NO-SIMD128-LABEL: extract_var_v8i16_u:
+; NO-SIMD128:         .functype extract_var_v8i16_u (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; NO-SIMD128-NEXT:    i32.const $push7=, 16
+; NO-SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; NO-SIMD128-NEXT:    local.tee $push8=, $9=, $pop9
+; NO-SIMD128-NEXT:    i32.store16 14($pop8), $7
+; NO-SIMD128-NEXT:    i32.store16 12($9), $6
+; NO-SIMD128-NEXT:    i32.store16 10($9), $5
+; NO-SIMD128-NEXT:    i32.store16 8($9), $4
+; NO-SIMD128-NEXT:    i32.store16 6($9), $3
+; NO-SIMD128-NEXT:    i32.store16 4($9), $2
+; NO-SIMD128-NEXT:    i32.store16 2($9), $1
+; NO-SIMD128-NEXT:    i32.store16 0($9), $0
+; NO-SIMD128-NEXT:    i32.const $push0=, 7
+; NO-SIMD128-NEXT:    i32.and $push1=, $8, $pop0
+; NO-SIMD128-NEXT:    i32.const $push2=, 1
+; NO-SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; NO-SIMD128-NEXT:    i32.or $push4=, $9, $pop3
+; NO-SIMD128-NEXT:    i32.load16_u $push5=, 0($pop4)
+; NO-SIMD128-NEXT:    return $pop5
   %elem = extractelement <8 x i16> %v, i32 %i
   %a = zext i16 %elem to i32
   ret i32 %a
 }
 
-; CHECK-LABEL: extract_undef_v8i16_u:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype extract_undef_v8i16_u (v128) -> (i32){{$}}
-; SIMD128-NEXT: i16x8.extract_lane_u $push[[R:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_undef_v8i16_u(<8 x i16> %v) {
+; SIMD128-LABEL: extract_undef_v8i16_u:
+; SIMD128:         .functype extract_undef_v8i16_u (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extract_lane_u $push0=, $0, 0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_undef_v8i16_u:
+; NO-SIMD128:         .functype extract_undef_v8i16_u (i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.const $push0=, 65535
+; NO-SIMD128-NEXT:    i32.and $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    return $pop1
   %elem = extractelement <8 x i16> %v, i16 undef
   %a = zext i16 %elem to i32
   ret i32 %a
 }
 
-; CHECK-LABEL: extract_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype extract_v8i16 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i16x8.extract_lane_u $push[[R:[0-9]+]]=, $0, 5{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i16 @extract_v8i16(<8 x i16> %v) {
+; SIMD128-LABEL: extract_v8i16:
+; SIMD128:         .functype extract_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extract_lane_u $push0=, $0, 5
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_v8i16:
+; NO-SIMD128:         .functype extract_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    return $5
   %elem = extractelement <8 x i16> %v, i16 5
   ret i16 %elem
 }
 
-; CHECK-LABEL: extract_var_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype extract_var_v8i16 (v128, i32) -> (i32){{$}}
-; SIMD128-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer{{$}}
-; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: local.tee $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
-; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 7{{$}}
-; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
-; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 1{{$}}
-; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
-; SIMD128-NEXT: i32.or $push[[L8:[0-9]+]]=, $2, $pop[[L7]]{{$}}
-; SIMD128-NEXT: i32.load16_u $push[[R:[0-9]+]]=, 0($pop[[L8]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i16 @extract_var_v8i16(<8 x i16> %v, i32 %i) {
+; SIMD128-LABEL: extract_var_v8i16:
+; SIMD128:         .functype extract_var_v8i16 (v128, i32) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; SIMD128-NEXT:    i32.const $push7=, 16
+; SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; SIMD128-NEXT:    local.tee $push8=, $2=, $pop9
+; SIMD128-NEXT:    v128.store 0($pop8), $0
+; SIMD128-NEXT:    i32.const $push0=, 7
+; SIMD128-NEXT:    i32.and $push1=, $1, $pop0
+; SIMD128-NEXT:    i32.const $push2=, 1
+; SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; SIMD128-NEXT:    i32.or $push4=, $2, $pop3
+; SIMD128-NEXT:    i32.load16_u $push5=, 0($pop4)
+; SIMD128-NEXT:    return $pop5
+;
+; NO-SIMD128-LABEL: extract_var_v8i16:
+; NO-SIMD128:         .functype extract_var_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; NO-SIMD128-NEXT:    i32.const $push7=, 16
+; NO-SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; NO-SIMD128-NEXT:    local.tee $push8=, $9=, $pop9
+; NO-SIMD128-NEXT:    i32.store16 14($pop8), $7
+; NO-SIMD128-NEXT:    i32.store16 12($9), $6
+; NO-SIMD128-NEXT:    i32.store16 10($9), $5
+; NO-SIMD128-NEXT:    i32.store16 8($9), $4
+; NO-SIMD128-NEXT:    i32.store16 6($9), $3
+; NO-SIMD128-NEXT:    i32.store16 4($9), $2
+; NO-SIMD128-NEXT:    i32.store16 2($9), $1
+; NO-SIMD128-NEXT:    i32.store16 0($9), $0
+; NO-SIMD128-NEXT:    i32.const $push0=, 7
+; NO-SIMD128-NEXT:    i32.and $push1=, $8, $pop0
+; NO-SIMD128-NEXT:    i32.const $push2=, 1
+; NO-SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; NO-SIMD128-NEXT:    i32.or $push4=, $9, $pop3
+; NO-SIMD128-NEXT:    i32.load16_u $push5=, 0($pop4)
+; NO-SIMD128-NEXT:    return $pop5
   %elem = extractelement <8 x i16> %v, i32 %i
   ret i16 %elem
 }
 
-; CHECK-LABEL: extract_undef_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype extract_undef_v8i16 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i16x8.extract_lane_u $push[[R:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i16 @extract_undef_v8i16(<8 x i16> %v) {
+; SIMD128-LABEL: extract_undef_v8i16:
+; SIMD128:         .functype extract_undef_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extract_lane_u $push0=, $0, 0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_undef_v8i16:
+; NO-SIMD128:         .functype extract_undef_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    return $0
   %elem = extractelement <8 x i16> %v, i16 undef
   ret i16 %elem
 }
 
-; CHECK-LABEL: replace_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype replace_v8i16 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.replace_lane $push[[R:[0-9]+]]=, $0, 7, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @replace_v8i16(<8 x i16> %v, i16 %x) {
+; SIMD128-LABEL: replace_v8i16:
+; SIMD128:         .functype replace_v8i16 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.replace_lane $push0=, $0, 7, $1
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: replace_v8i16:
+; NO-SIMD128:         .functype replace_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store16 8($0), $5
+; NO-SIMD128-NEXT:    i32.store16 4($0), $3
+; NO-SIMD128-NEXT:    i32.store16 2($0), $2
+; NO-SIMD128-NEXT:    i32.store16 0($0), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 14
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store16 0($pop1), $9
+; NO-SIMD128-NEXT:    i32.const $push2=, 12
+; NO-SIMD128-NEXT:    i32.add $push3=, $0, $pop2
+; NO-SIMD128-NEXT:    i32.store16 0($pop3), $7
+; NO-SIMD128-NEXT:    i32.const $push4=, 10
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.store16 0($pop5), $6
+; NO-SIMD128-NEXT:    i32.const $push6=, 6
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.store16 0($pop7), $4
+; NO-SIMD128-NEXT:    return
   %res = insertelement <8 x i16> %v, i16 %x, i32 7
   ret <8 x i16> %res
 }
 
-; CHECK-LABEL: replace_var_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype replace_var_v8i16 (v128, i32, i32) -> (v128){{$}}
-; SIMD128-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer{{$}}
-; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: local.tee $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
-; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 7{{$}}
-; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
-; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 1{{$}}
-; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
-; SIMD128-NEXT: i32.or $push[[L8:[0-9]+]]=, $3, $pop[[L7]]{{$}}
-; SIMD128-NEXT: i32.store16 0($pop[[L8]]), $2{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @replace_var_v8i16(<8 x i16> %v, i32 %i, i16 %x) {
+; SIMD128-LABEL: replace_var_v8i16:
+; SIMD128:         .functype replace_var_v8i16 (v128, i32, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; SIMD128-NEXT:    i32.const $push7=, 16
+; SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; SIMD128-NEXT:    local.tee $push8=, $3=, $pop9
+; SIMD128-NEXT:    v128.store 0($pop8), $0
+; SIMD128-NEXT:    i32.const $push0=, 7
+; SIMD128-NEXT:    i32.and $push1=, $1, $pop0
+; SIMD128-NEXT:    i32.const $push2=, 1
+; SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; SIMD128-NEXT:    i32.or $push4=, $3, $pop3
+; SIMD128-NEXT:    i32.store16 0($pop4), $2
+; SIMD128-NEXT:    v128.load $push5=, 0($3)
+; SIMD128-NEXT:    return $pop5
+;
+; NO-SIMD128-LABEL: replace_var_v8i16:
+; NO-SIMD128:         .functype replace_var_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    global.get $push7=, __stack_pointer
+; NO-SIMD128-NEXT:    i32.const $push8=, 16
+; NO-SIMD128-NEXT:    i32.sub $push10=, $pop7, $pop8
+; NO-SIMD128-NEXT:    local.tee $push9=, $11=, $pop10
+; NO-SIMD128-NEXT:    i32.store16 14($pop9), $8
+; NO-SIMD128-NEXT:    i32.store16 12($11), $7
+; NO-SIMD128-NEXT:    i32.store16 10($11), $6
+; NO-SIMD128-NEXT:    i32.store16 8($11), $5
+; NO-SIMD128-NEXT:    i32.store16 6($11), $4
+; NO-SIMD128-NEXT:    i32.store16 4($11), $3
+; NO-SIMD128-NEXT:    i32.store16 2($11), $2
+; NO-SIMD128-NEXT:    i32.store16 0($11), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 7
+; NO-SIMD128-NEXT:    i32.and $push1=, $9, $pop0
+; NO-SIMD128-NEXT:    i32.const $push2=, 1
+; NO-SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; NO-SIMD128-NEXT:    i32.or $push4=, $11, $pop3
+; NO-SIMD128-NEXT:    i32.store16 0($pop4), $10
+; NO-SIMD128-NEXT:    i64.load $push5=, 8($11)
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop5
+; NO-SIMD128-NEXT:    i64.load $push6=, 0($11)
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop6
+; NO-SIMD128-NEXT:    return
   %res = insertelement <8 x i16> %v, i16 %x, i32 %i
   ret <8 x i16> %res
 }
 
-; CHECK-LABEL: replace_zero_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype replace_zero_v8i16 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @replace_zero_v8i16(<8 x i16> %v, i16 %x) {
+; SIMD128-LABEL: replace_zero_v8i16:
+; SIMD128:         .functype replace_zero_v8i16 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.replace_lane $push0=, $0, 0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: replace_zero_v8i16:
+; NO-SIMD128:         .functype replace_zero_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store16 8($0), $5
+; NO-SIMD128-NEXT:    i32.store16 4($0), $3
+; NO-SIMD128-NEXT:    i32.store16 2($0), $2
+; NO-SIMD128-NEXT:    i32.store16 0($0), $9
+; NO-SIMD128-NEXT:    i32.const $push0=, 14
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store16 0($pop1), $8
+; NO-SIMD128-NEXT:    i32.const $push2=, 12
+; NO-SIMD128-NEXT:    i32.add $push3=, $0, $pop2
+; NO-SIMD128-NEXT:    i32.store16 0($pop3), $7
+; NO-SIMD128-NEXT:    i32.const $push4=, 10
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.store16 0($pop5), $6
+; NO-SIMD128-NEXT:    i32.const $push6=, 6
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.store16 0($pop7), $4
+; NO-SIMD128-NEXT:    return
   %res = insertelement <8 x i16> %v, i16 %x, i32 0
   ret <8 x i16> %res
 }
 
-; CHECK-LABEL: shuffle_v8i16:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shuffle_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
-; SIMD128-SAME: 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @shuffle_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SIMD128-LABEL: shuffle_v8i16:
+; SIMD128:         .functype shuffle_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $1, 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shuffle_v8i16:
+; NO-SIMD128:         .functype shuffle_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store16 8($0), $5
+; NO-SIMD128-NEXT:    i32.store16 4($0), $3
+; NO-SIMD128-NEXT:    i32.store16 2($0), $10
+; NO-SIMD128-NEXT:    i32.store16 0($0), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 14
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store16 0($pop1), $16
+; NO-SIMD128-NEXT:    i32.const $push2=, 12
+; NO-SIMD128-NEXT:    i32.add $push3=, $0, $pop2
+; NO-SIMD128-NEXT:    i32.store16 0($pop3), $7
+; NO-SIMD128-NEXT:    i32.const $push4=, 10
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.store16 0($pop5), $14
+; NO-SIMD128-NEXT:    i32.const $push6=, 6
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.store16 0($pop7), $12
+; NO-SIMD128-NEXT:    return
   %res = shufflevector <8 x i16> %x, <8 x i16> %y,
     <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   ret <8 x i16> %res
 }
 
-; CHECK-LABEL: shuffle_undef_v8i16:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shuffle_undef_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
-; SIMD128-SAME: 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @shuffle_undef_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SIMD128-LABEL: shuffle_undef_v8i16:
+; SIMD128:         .functype shuffle_undef_v8i16 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shuffle_undef_v8i16:
+; NO-SIMD128:         .functype shuffle_undef_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store16 8($0), $2
+; NO-SIMD128-NEXT:    i32.store16 4($0), $2
+; NO-SIMD128-NEXT:    i32.store16 2($0), $2
+; NO-SIMD128-NEXT:    i32.store16 0($0), $2
+; NO-SIMD128-NEXT:    i32.const $push0=, 14
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store16 0($pop1), $2
+; NO-SIMD128-NEXT:    i32.const $push2=, 12
+; NO-SIMD128-NEXT:    i32.add $push3=, $0, $pop2
+; NO-SIMD128-NEXT:    i32.store16 0($pop3), $2
+; NO-SIMD128-NEXT:    i32.const $push4=, 10
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.store16 0($pop5), $2
+; NO-SIMD128-NEXT:    i32.const $push6=, 6
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.store16 0($pop7), $2
+; NO-SIMD128-NEXT:    return
   %res = shufflevector <8 x i16> %x, <8 x i16> %y,
     <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
                i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %res
 }
 
-; CHECK-LABEL: build_v8i16:
-; NO-SIMD128-NOT: i16x8
-; SIMD128-NEXT: .functype build_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.splat $push[[L0:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: i16x8.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 1, $1{{$}}
-; SIMD128-NEXT: i16x8.replace_lane $push[[L2:[0-9]+]]=, $pop[[L1]], 2, $2{{$}}
-; SIMD128-NEXT: i16x8.replace_lane $push[[L3:[0-9]+]]=, $pop[[L2]], 3, $3{{$}}
-; SIMD128-NEXT: i16x8.replace_lane $push[[L4:[0-9]+]]=, $pop[[L3]], 4, $4{{$}}
-; SIMD128-NEXT: i16x8.replace_lane $push[[L5:[0-9]+]]=, $pop[[L4]], 5, $5{{$}}
-; SIMD128-NEXT: i16x8.replace_lane $push[[L6:[0-9]+]]=, $pop[[L5]], 6, $6{{$}}
-; SIMD128-NEXT: i16x8.replace_lane $push[[R:[0-9]+]]=, $pop[[L6]], 7, $7{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @build_v8i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3,
+; SIMD128-LABEL: build_v8i16:
+; SIMD128:         .functype build_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.splat $push0=, $0
+; SIMD128-NEXT:    i16x8.replace_lane $push1=, $pop0, 1, $1
+; SIMD128-NEXT:    i16x8.replace_lane $push2=, $pop1, 2, $2
+; SIMD128-NEXT:    i16x8.replace_lane $push3=, $pop2, 3, $3
+; SIMD128-NEXT:    i16x8.replace_lane $push4=, $pop3, 4, $4
+; SIMD128-NEXT:    i16x8.replace_lane $push5=, $pop4, 5, $5
+; SIMD128-NEXT:    i16x8.replace_lane $push6=, $pop5, 6, $6
+; SIMD128-NEXT:    i16x8.replace_lane $push7=, $pop6, 7, $7
+; SIMD128-NEXT:    return $pop7
+;
+; NO-SIMD128-LABEL: build_v8i16:
+; NO-SIMD128:         .functype build_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store16 8($0), $5
+; NO-SIMD128-NEXT:    i32.store16 4($0), $3
+; NO-SIMD128-NEXT:    i32.store16 2($0), $2
+; NO-SIMD128-NEXT:    i32.store16 0($0), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 14
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store16 0($pop1), $8
+; NO-SIMD128-NEXT:    i32.const $push2=, 12
+; NO-SIMD128-NEXT:    i32.add $push3=, $0, $pop2
+; NO-SIMD128-NEXT:    i32.store16 0($pop3), $7
+; NO-SIMD128-NEXT:    i32.const $push4=, 10
+; NO-SIMD128-NEXT:    i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT:    i32.store16 0($pop5), $6
+; NO-SIMD128-NEXT:    i32.const $push6=, 6
+; NO-SIMD128-NEXT:    i32.add $push7=, $0, $pop6
+; NO-SIMD128-NEXT:    i32.store16 0($pop7), $4
+; NO-SIMD128-NEXT:    return
                               i16 %x4, i16 %x5, i16 %x6, i16 %x7) {
   %t0 = insertelement <8 x i16> undef, i16 %x0, i32 0
   %t1 = insertelement <8 x i16> %t0, i16 %x1, i32 1
@@ -520,147 +1230,284 @@ define <8 x i16> @build_v8i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3,
 ; ==============================================================================
 ; 4 x i32
 ; ==============================================================================
-; CHECK-LABEL: const_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype const_v4i32 () -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 50462976, 117835012, 185207048, 252579084{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @const_v4i32() {
+; SIMD128-LABEL: const_v4i32:
+; SIMD128:         .functype const_v4i32 () -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 50462976, 117835012, 185207048, 252579084
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: const_v4i32:
+; NO-SIMD128:         .functype const_v4i32 (i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 1084818905618843912
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.const $push1=, 506097522914230528
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
   ret <4 x i32> <i32 50462976, i32 117835012, i32 185207048, i32 252579084>
 }
 
-; CHECK-LABEL: splat_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype splat_v4i32 (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.splat $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @splat_v4i32(i32 %x) {
+; SIMD128-LABEL: splat_v4i32:
+; SIMD128:         .functype splat_v4i32 (i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.splat $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: splat_v4i32:
+; NO-SIMD128:         .functype splat_v4i32 (i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store 8($0), $1
+; NO-SIMD128-NEXT:    i32.store 4($0), $1
+; NO-SIMD128-NEXT:    i32.store 0($0), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 12
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store 0($pop1), $1
+; NO-SIMD128-NEXT:    return
   %v = insertelement <4 x i32> undef, i32 %x, i32 0
   %res = shufflevector <4 x i32> %v, <4 x i32> undef,
     <4 x i32> <i32 0, i32 0, i32 0, i32 0>
   ret <4 x i32> %res
 }
 
-; CHECK-LABEL: const_splat_v4i32:
-; SIMD128: v128.const $push0=, 42, 42, 42, 42{{$}}
 define <4 x i32> @const_splat_v4i32() {
+; SIMD128-LABEL: const_splat_v4i32:
+; SIMD128:         .functype const_splat_v4i32 () -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 42, 42, 42, 42
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: const_splat_v4i32:
+; NO-SIMD128:         .functype const_splat_v4i32 (i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 180388626474
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.const $push1=, 180388626474
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
   ret <4 x i32> <i32 42, i32 42, i32 42, i32 42>
 }
 
-; CHECK-LABEL: extract_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype extract_v4i32 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i32x4.extract_lane $push[[R:[0-9]+]]=, $0, 3{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_v4i32(<4 x i32> %v) {
+; SIMD128-LABEL: extract_v4i32:
+; SIMD128:         .functype extract_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.extract_lane $push0=, $0, 3
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_v4i32:
+; NO-SIMD128:         .functype extract_v4i32 (i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    return $3
   %elem = extractelement <4 x i32> %v, i32 3
   ret i32 %elem
 }
 
-; CHECK-LABEL: extract_var_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype extract_var_v4i32 (v128, i32) -> (i32){{$}}
-; SIMD128-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer{{$}}
-; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: local.tee $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
-; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 3{{$}}
-; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
-; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 2{{$}}
-; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
-; SIMD128-NEXT: i32.or $push[[L4:[0-9]+]]=, $2, $pop[[L7]]{{$}}
-; SIMD128-NEXT: i32.load $push[[R:[0-9]+]]=, 0($pop[[L4]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_var_v4i32(<4 x i32> %v, i32 %i) {
+; SIMD128-LABEL: extract_var_v4i32:
+; SIMD128:         .functype extract_var_v4i32 (v128, i32) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; SIMD128-NEXT:    i32.const $push7=, 16
+; SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; SIMD128-NEXT:    local.tee $push8=, $2=, $pop9
+; SIMD128-NEXT:    v128.store 0($pop8), $0
+; SIMD128-NEXT:    i32.const $push0=, 3
+; SIMD128-NEXT:    i32.and $push1=, $1, $pop0
+; SIMD128-NEXT:    i32.const $push2=, 2
+; SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; SIMD128-NEXT:    i32.or $push4=, $2, $pop3
+; SIMD128-NEXT:    i32.load $push5=, 0($pop4)
+; SIMD128-NEXT:    return $pop5
+;
+; NO-SIMD128-LABEL: extract_var_v4i32:
+; NO-SIMD128:         .functype extract_var_v4i32 (i32, i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; NO-SIMD128-NEXT:    i32.const $push7=, 16
+; NO-SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; NO-SIMD128-NEXT:    local.tee $push8=, $5=, $pop9
+; NO-SIMD128-NEXT:    i32.store 12($pop8), $3
+; NO-SIMD128-NEXT:    i32.store 8($5), $2
+; NO-SIMD128-NEXT:    i32.store 4($5), $1
+; NO-SIMD128-NEXT:    i32.store 0($5), $0
+; NO-SIMD128-NEXT:    i32.const $push0=, 3
+; NO-SIMD128-NEXT:    i32.and $push1=, $4, $pop0
+; NO-SIMD128-NEXT:    i32.const $push2=, 2
+; NO-SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; NO-SIMD128-NEXT:    i32.or $push4=, $5, $pop3
+; NO-SIMD128-NEXT:    i32.load $push5=, 0($pop4)
+; NO-SIMD128-NEXT:    return $pop5
   %elem = extractelement <4 x i32> %v, i32 %i
   ret i32 %elem
 }
 
-; CHECK-LABEL: extract_zero_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype extract_zero_v4i32 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i32x4.extract_lane $push[[R:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i32 @extract_zero_v4i32(<4 x i32> %v) {
+; SIMD128-LABEL: extract_zero_v4i32:
+; SIMD128:         .functype extract_zero_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.extract_lane $push0=, $0, 0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_zero_v4i32:
+; NO-SIMD128:         .functype extract_zero_v4i32 (i32, i32, i32, i32) -> (i32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    return $0
   %elem = extractelement <4 x i32> %v, i32 0
   ret i32 %elem
 }
 
-; CHECK-LABEL: replace_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype replace_v4i32 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.replace_lane $push[[R:[0-9]+]]=, $0, 2, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @replace_v4i32(<4 x i32> %v, i32 %x) {
+; SIMD128-LABEL: replace_v4i32:
+; SIMD128:         .functype replace_v4i32 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.replace_lane $push0=, $0, 2, $1
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: replace_v4i32:
+; NO-SIMD128:         .functype replace_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store 8($0), $5
+; NO-SIMD128-NEXT:    i32.store 4($0), $2
+; NO-SIMD128-NEXT:    i32.store 0($0), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 12
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store 0($pop1), $4
+; NO-SIMD128-NEXT:    return
   %res = insertelement <4 x i32> %v, i32 %x, i32 2
   ret <4 x i32> %res
 }
 
-; CHECK-LABEL: replace_var_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype replace_var_v4i32 (v128, i32, i32) -> (v128){{$}}
-; SIMD128-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer{{$}}
-; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: local.tee $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
-; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 3{{$}}
-; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
-; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 2{{$}}
-; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
-; SIMD128-NEXT: i32.or $push[[L4:[0-9]+]]=, $3, $pop[[L7]]{{$}}
-; SIMD128-NEXT: i32.store 0($pop[[L4]]), $2{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @replace_var_v4i32(<4 x i32> %v, i32 %i, i32 %x) {
+; SIMD128-LABEL: replace_var_v4i32:
+; SIMD128:         .functype replace_var_v4i32 (v128, i32, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; SIMD128-NEXT:    i32.const $push7=, 16
+; SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; SIMD128-NEXT:    local.tee $push8=, $3=, $pop9
+; SIMD128-NEXT:    v128.store 0($pop8), $0
+; SIMD128-NEXT:    i32.const $push0=, 3
+; SIMD128-NEXT:    i32.and $push1=, $1, $pop0
+; SIMD128-NEXT:    i32.const $push2=, 2
+; SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; SIMD128-NEXT:    i32.or $push4=, $3, $pop3
+; SIMD128-NEXT:    i32.store 0($pop4), $2
+; SIMD128-NEXT:    v128.load $push5=, 0($3)
+; SIMD128-NEXT:    return $pop5
+;
+; NO-SIMD128-LABEL: replace_var_v4i32:
+; NO-SIMD128:         .functype replace_var_v4i32 (i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    global.get $push7=, __stack_pointer
+; NO-SIMD128-NEXT:    i32.const $push8=, 16
+; NO-SIMD128-NEXT:    i32.sub $push10=, $pop7, $pop8
+; NO-SIMD128-NEXT:    local.tee $push9=, $7=, $pop10
+; NO-SIMD128-NEXT:    i32.store 12($pop9), $4
+; NO-SIMD128-NEXT:    i32.store 8($7), $3
+; NO-SIMD128-NEXT:    i32.store 4($7), $2
+; NO-SIMD128-NEXT:    i32.store 0($7), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 3
+; NO-SIMD128-NEXT:    i32.and $push1=, $5, $pop0
+; NO-SIMD128-NEXT:    i32.const $push2=, 2
+; NO-SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; NO-SIMD128-NEXT:    i32.or $push4=, $7, $pop3
+; NO-SIMD128-NEXT:    i32.store 0($pop4), $6
+; NO-SIMD128-NEXT:    i64.load $push5=, 8($7)
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop5
+; NO-SIMD128-NEXT:    i64.load $push6=, 0($7)
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop6
+; NO-SIMD128-NEXT:    return
   %res = insertelement <4 x i32> %v, i32 %x, i32 %i
   ret <4 x i32> %res
 }
 
-; CHECK-LABEL: replace_zero_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype replace_zero_v4i32 (v128, i32) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @replace_zero_v4i32(<4 x i32> %v, i32 %x) {
+; SIMD128-LABEL: replace_zero_v4i32:
+; SIMD128:         .functype replace_zero_v4i32 (v128, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.replace_lane $push0=, $0, 0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: replace_zero_v4i32:
+; NO-SIMD128:         .functype replace_zero_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store 8($0), $3
+; NO-SIMD128-NEXT:    i32.store 4($0), $2
+; NO-SIMD128-NEXT:    i32.store 0($0), $5
+; NO-SIMD128-NEXT:    i32.const $push0=, 12
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store 0($pop1), $4
+; NO-SIMD128-NEXT:    return
   %res = insertelement <4 x i32> %v, i32 %x, i32 0
   ret <4 x i32> %res
 }
 
-; CHECK-LABEL: shuffle_v4i32:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shuffle_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
-; SIMD128-SAME: 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @shuffle_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SIMD128-LABEL: shuffle_v4i32:
+; SIMD128:         .functype shuffle_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $1, 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shuffle_v4i32:
+; NO-SIMD128:         .functype shuffle_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store 8($0), $3
+; NO-SIMD128-NEXT:    i32.store 4($0), $6
+; NO-SIMD128-NEXT:    i32.store 0($0), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 12
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store 0($pop1), $8
+; NO-SIMD128-NEXT:    return
   %res = shufflevector <4 x i32> %x, <4 x i32> %y,
     <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   ret <4 x i32> %res
 }
 
-; CHECK-LABEL: shuffle_undef_v4i32:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shuffle_undef_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
-; SIMD128-SAME: 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @shuffle_undef_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SIMD128-LABEL: shuffle_undef_v4i32:
+; SIMD128:         .functype shuffle_undef_v4i32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shuffle_undef_v4i32:
+; NO-SIMD128:         .functype shuffle_undef_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store 8($0), $2
+; NO-SIMD128-NEXT:    i32.store 4($0), $2
+; NO-SIMD128-NEXT:    i32.store 0($0), $2
+; NO-SIMD128-NEXT:    i32.const $push0=, 12
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store 0($pop1), $2
+; NO-SIMD128-NEXT:    return
   %res = shufflevector <4 x i32> %x, <4 x i32> %y,
     <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   ret <4 x i32> %res
 }
 
-; CHECK-LABEL: build_v4i32:
-; NO-SIMD128-NOT: i32x4
-; SIMD128-NEXT: .functype build_v4i32 (i32, i32, i32, i32) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.splat $push[[L0:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: i32x4.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 1, $1{{$}}
-; SIMD128-NEXT: i32x4.replace_lane $push[[L2:[0-9]+]]=, $pop[[L1]], 2, $2{{$}}
-; SIMD128-NEXT: i32x4.replace_lane $push[[R:[0-9]+]]=, $pop[[L2]], 3, $3{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @build_v4i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
+; SIMD128-LABEL: build_v4i32:
+; SIMD128:         .functype build_v4i32 (i32, i32, i32, i32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.splat $push0=, $0
+; SIMD128-NEXT:    i32x4.replace_lane $push1=, $pop0, 1, $1
+; SIMD128-NEXT:    i32x4.replace_lane $push2=, $pop1, 2, $2
+; SIMD128-NEXT:    i32x4.replace_lane $push3=, $pop2, 3, $3
+; SIMD128-NEXT:    return $pop3
+;
+; NO-SIMD128-LABEL: build_v4i32:
+; NO-SIMD128:         .functype build_v4i32 (i32, i32, i32, i32, i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i32.store 8($0), $3
+; NO-SIMD128-NEXT:    i32.store 4($0), $2
+; NO-SIMD128-NEXT:    i32.store 0($0), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 12
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    i32.store 0($pop1), $4
+; NO-SIMD128-NEXT:    return
   %t0 = insertelement <4 x i32> undef, i32 %x0, i32 0
   %t1 = insertelement <4 x i32> %t0, i32 %x1, i32 1
   %t2 = insertelement <4 x i32> %t1, i32 %x2, i32 2
@@ -671,143 +1518,252 @@ define <4 x i32> @build_v4i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
 ; ==============================================================================
 ; 2 x i64
 ; ==============================================================================
-; CHECK-LABEL: const_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype const_v2i64 () -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 506097522914230528, 1084818905618843912{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @const_v2i64() {
+; SIMD128-LABEL: const_v2i64:
+; SIMD128:         .functype const_v2i64 () -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 506097522914230528, 1084818905618843912
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: const_v2i64:
+; NO-SIMD128:         .functype const_v2i64 (i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 1084818905618843912
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.const $push1=, 506097522914230528
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
   ret <2 x i64> <i64 506097522914230528, i64 1084818905618843912>
 }
 
-; CHECK-LABEL: splat_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype splat_v2i64 (i64) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.splat $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @splat_v2i64(i64 %x) {
+; SIMD128-LABEL: splat_v2i64:
+; SIMD128:         .functype splat_v2i64 (i64) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.splat $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: splat_v2i64:
+; NO-SIMD128:         .functype splat_v2i64 (i32, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.store 8($0), $1
+; NO-SIMD128-NEXT:    i64.store 0($0), $1
+; NO-SIMD128-NEXT:    return
   %t1 = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0
   %res = insertelement <2 x i64> %t1, i64 %x, i32 1
   ret <2 x i64> %res
 }
 
-; CHECK-LABEL: const_splat_v2i64:
-; SIMD128: v128.const $push0=, 42, 42{{$}}
 define <2 x i64> @const_splat_v2i64() {
+; SIMD128-LABEL: const_splat_v2i64:
+; SIMD128:         .functype const_splat_v2i64 () -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 42, 42
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: const_splat_v2i64:
+; NO-SIMD128:         .functype const_splat_v2i64 (i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 42
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.const $push1=, 42
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
   ret <2 x i64> <i64 42, i64 42>
 }
 
-; CHECK-LABEL: extract_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype extract_v2i64 (v128) -> (i64){{$}}
-; SIMD128-NEXT: i64x2.extract_lane $push[[R:[0-9]+]]=, $0, 1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i64 @extract_v2i64(<2 x i64> %v) {
+; SIMD128-LABEL: extract_v2i64:
+; SIMD128:         .functype extract_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_v2i64:
+; NO-SIMD128:         .functype extract_v2i64 (i64, i64) -> (i64)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    return $1
   %elem = extractelement <2 x i64> %v, i64 1
   ret i64 %elem
 }
 
-; CHECK-LABEL: extract_var_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype extract_var_v2i64 (v128, i32) -> (i64){{$}}
-; SIMD128-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer{{$}}
-; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: local.tee $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
-; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 1{{$}}
-; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
-; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 3{{$}}
-; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
-; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $2, $pop[[L7]]{{$}}
-; SIMD128-NEXT: i64.load $push[[R:[0-9]+]]=, 0($pop[[L2]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i64 @extract_var_v2i64(<2 x i64> %v, i32 %i) {
+; SIMD128-LABEL: extract_var_v2i64:
+; SIMD128:         .functype extract_var_v2i64 (v128, i32) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; SIMD128-NEXT:    i32.const $push7=, 16
+; SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; SIMD128-NEXT:    local.tee $push8=, $2=, $pop9
+; SIMD128-NEXT:    v128.store 0($pop8), $0
+; SIMD128-NEXT:    i32.const $push0=, 1
+; SIMD128-NEXT:    i32.and $push1=, $1, $pop0
+; SIMD128-NEXT:    i32.const $push2=, 3
+; SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; SIMD128-NEXT:    i32.or $push4=, $2, $pop3
+; SIMD128-NEXT:    i64.load $push5=, 0($pop4)
+; SIMD128-NEXT:    return $pop5
+;
+; NO-SIMD128-LABEL: extract_var_v2i64:
+; NO-SIMD128:         .functype extract_var_v2i64 (i64, i64, i32) -> (i64)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; NO-SIMD128-NEXT:    i32.const $push7=, 16
+; NO-SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; NO-SIMD128-NEXT:    local.tee $push8=, $3=, $pop9
+; NO-SIMD128-NEXT:    i64.store 8($pop8), $1
+; NO-SIMD128-NEXT:    i64.store 0($3), $0
+; NO-SIMD128-NEXT:    i32.const $push0=, 1
+; NO-SIMD128-NEXT:    i32.and $push1=, $2, $pop0
+; NO-SIMD128-NEXT:    i32.const $push2=, 3
+; NO-SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; NO-SIMD128-NEXT:    i32.or $push4=, $3, $pop3
+; NO-SIMD128-NEXT:    i64.load $push5=, 0($pop4)
+; NO-SIMD128-NEXT:    return $pop5
   %elem = extractelement <2 x i64> %v, i32 %i
   ret i64 %elem
 }
 
-; CHECK-LABEL: extract_zero_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype extract_zero_v2i64 (v128) -> (i64){{$}}
-; SIMD128-NEXT: i64x2.extract_lane $push[[R:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define i64 @extract_zero_v2i64(<2 x i64> %v) {
+; SIMD128-LABEL: extract_zero_v2i64:
+; SIMD128:         .functype extract_zero_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.extract_lane $push0=, $0, 0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_zero_v2i64:
+; NO-SIMD128:         .functype extract_zero_v2i64 (i64, i64) -> (i64)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    return $0
   %elem = extractelement <2 x i64> %v, i64 0
   ret i64 %elem
 }
 
-; CHECK-LABEL: replace_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype replace_v2i64 (v128, i64) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @replace_v2i64(<2 x i64> %v, i64 %x) {
+; SIMD128-LABEL: replace_v2i64:
+; SIMD128:         .functype replace_v2i64 (v128, i64) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.replace_lane $push0=, $0, 0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: replace_v2i64:
+; NO-SIMD128:         .functype replace_v2i64 (i32, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.store 8($0), $2
+; NO-SIMD128-NEXT:    i64.store 0($0), $3
+; NO-SIMD128-NEXT:    return
   %res = insertelement <2 x i64> %v, i64 %x, i32 0
   ret <2 x i64> %res
 }
 
-; CHECK-LABEL: replace_var_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype replace_var_v2i64 (v128, i32, i64) -> (v128){{$}}
-; SIMD128-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer{{$}}
-; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: local.tee $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
-; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 1{{$}}
-; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
-; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 3{{$}}
-; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
-; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $3, $pop[[L7]]{{$}}
-; SIMD128-NEXT: i64.store 0($pop[[L2]]), $2{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @replace_var_v2i64(<2 x i64> %v, i32 %i, i64 %x) {
+; SIMD128-LABEL: replace_var_v2i64:
+; SIMD128:         .functype replace_var_v2i64 (v128, i32, i64) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; SIMD128-NEXT:    i32.const $push7=, 16
+; SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; SIMD128-NEXT:    local.tee $push8=, $3=, $pop9
+; SIMD128-NEXT:    v128.store 0($pop8), $0
+; SIMD128-NEXT:    i32.const $push0=, 1
+; SIMD128-NEXT:    i32.and $push1=, $1, $pop0
+; SIMD128-NEXT:    i32.const $push2=, 3
+; SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; SIMD128-NEXT:    i32.or $push4=, $3, $pop3
+; SIMD128-NEXT:    i64.store 0($pop4), $2
+; SIMD128-NEXT:    v128.load $push5=, 0($3)
+; SIMD128-NEXT:    return $pop5
+;
+; NO-SIMD128-LABEL: replace_var_v2i64:
+; NO-SIMD128:         .functype replace_var_v2i64 (i32, i64, i64, i32, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    global.get $push7=, __stack_pointer
+; NO-SIMD128-NEXT:    i32.const $push8=, 16
+; NO-SIMD128-NEXT:    i32.sub $push10=, $pop7, $pop8
+; NO-SIMD128-NEXT:    local.tee $push9=, $5=, $pop10
+; NO-SIMD128-NEXT:    i64.store 8($pop9), $2
+; NO-SIMD128-NEXT:    i64.store 0($5), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 1
+; NO-SIMD128-NEXT:    i32.and $push1=, $3, $pop0
+; NO-SIMD128-NEXT:    i32.const $push2=, 3
+; NO-SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; NO-SIMD128-NEXT:    i32.or $push4=, $5, $pop3
+; NO-SIMD128-NEXT:    i64.store 0($pop4), $4
+; NO-SIMD128-NEXT:    i64.load $push5=, 8($5)
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop5
+; NO-SIMD128-NEXT:    i64.load $push6=, 0($5)
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop6
+; NO-SIMD128-NEXT:    return
   %res = insertelement <2 x i64> %v, i64 %x, i32 %i
   ret <2 x i64> %res
 }
 
-; CHECK-LABEL: replace_zero_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype replace_zero_v2i64 (v128, i64) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @replace_zero_v2i64(<2 x i64> %v, i64 %x) {
+; SIMD128-LABEL: replace_zero_v2i64:
+; SIMD128:         .functype replace_zero_v2i64 (v128, i64) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.replace_lane $push0=, $0, 0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: replace_zero_v2i64:
+; NO-SIMD128:         .functype replace_zero_v2i64 (i32, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.store 8($0), $2
+; NO-SIMD128-NEXT:    i64.store 0($0), $3
+; NO-SIMD128-NEXT:    return
   %res = insertelement <2 x i64> %v, i64 %x, i32 0
   ret <2 x i64> %res
 }
 
-; CHECK-LABEL: shuffle_v2i64:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shuffle_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
-; SIMD128-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shuffle_v2i64(<2 x i64> %x, <2 x i64> %y) {
+; SIMD128-LABEL: shuffle_v2i64:
+; SIMD128:         .functype shuffle_v2i64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $1, 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shuffle_v2i64:
+; NO-SIMD128:         .functype shuffle_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.store 8($0), $4
+; NO-SIMD128-NEXT:    i64.store 0($0), $1
+; NO-SIMD128-NEXT:    return
   %res = shufflevector <2 x i64> %x, <2 x i64> %y, <2 x i32> <i32 0, i32 3>
   ret <2 x i64> %res
 }
 
-; CHECK-LABEL: shuffle_undef_v2i64:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shuffle_undef_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
-; SIMD128-SAME: 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shuffle_undef_v2i64(<2 x i64> %x, <2 x i64> %y) {
+; SIMD128-LABEL: shuffle_undef_v2i64:
+; SIMD128:         .functype shuffle_undef_v2i64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shuffle_undef_v2i64:
+; NO-SIMD128:         .functype shuffle_undef_v2i64 (i32, i64, i64, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.store 8($0), $2
+; NO-SIMD128-NEXT:    i64.store 0($0), $2
+; NO-SIMD128-NEXT:    return
   %res = shufflevector <2 x i64> %x, <2 x i64> %y,
     <2 x i32> <i32 1, i32 undef>
   ret <2 x i64> %res
 }
 
-; CHECK-LABEL: build_v2i64:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype build_v2i64 (i64, i64) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.splat $push[[L0:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L0]], 1, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @build_v2i64(i64 %x0, i64 %x1) {
+; SIMD128-LABEL: build_v2i64:
+; SIMD128:         .functype build_v2i64 (i64, i64) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.splat $push0=, $0
+; SIMD128-NEXT:    i64x2.replace_lane $push1=, $pop0, 1, $1
+; SIMD128-NEXT:    return $pop1
+;
+; NO-SIMD128-LABEL: build_v2i64:
+; NO-SIMD128:         .functype build_v2i64 (i32, i64, i64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.store 8($0), $2
+; NO-SIMD128-NEXT:    i64.store 0($0), $1
+; NO-SIMD128-NEXT:    return
   %t0 = insertelement <2 x i64> undef, i64 %x0, i32 0
   %res = insertelement <2 x i64> %t0, i64 %x1, i32 1
   ret <2 x i64> %res
@@ -816,149 +1772,285 @@ define <2 x i64> @build_v2i64(i64 %x0, i64 %x1) {
 ; ==============================================================================
 ; 4 x f32
 ; ==============================================================================
-; CHECK-LABEL: const_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype const_v4f32 () -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=,
-; SIMD128-SAME: 0x1.0402p-121, 0x1.0c0a08p-113, 0x1.14121p-105, 0x1.1c1a18p-97{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @const_v4f32() {
+; SIMD128-LABEL: const_v4f32:
+; SIMD128:         .functype const_v4f32 () -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0x1.0402p-121, 0x1.0c0a08p-113, 0x1.14121p-105, 0x1.1c1a18p-97
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: const_v4f32:
+; NO-SIMD128:         .functype const_v4f32 (i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 1084818905618843912
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.const $push1=, 506097522914230528
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
   ret <4 x float> <float 0x3860402000000000, float 0x38e0c0a080000000,
                    float 0x3961412100000000, float 0x39e1c1a180000000>
 }
 
-; CHECK-LABEL: splat_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype splat_v4f32 (f32) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.splat $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @splat_v4f32(float %x) {
+; SIMD128-LABEL: splat_v4f32:
+; SIMD128:         .functype splat_v4f32 (f32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.splat $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: splat_v4f32:
+; NO-SIMD128:         .functype splat_v4f32 (i32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.store 8($0), $1
+; NO-SIMD128-NEXT:    f32.store 4($0), $1
+; NO-SIMD128-NEXT:    f32.store 0($0), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 12
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    f32.store 0($pop1), $1
+; NO-SIMD128-NEXT:    return
   %v = insertelement <4 x float> undef, float %x, i32 0
   %res = shufflevector <4 x float> %v, <4 x float> undef,
     <4 x i32> <i32 0, i32 0, i32 0, i32 0>
   ret <4 x float> %res
 }
 
-; CHECK-LABEL: const_splat_v4f32
-; SIMD128: v128.const $push0=, 0x1.5p5, 0x1.5p5, 0x1.5p5, 0x1.5p5{{$}}
 define <4 x float> @const_splat_v4f32() {
+; SIMD128-LABEL: const_splat_v4f32:
+; SIMD128:         .functype const_splat_v4f32 () -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0x1.5p5, 0x1.5p5, 0x1.5p5, 0x1.5p5
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: const_splat_v4f32:
+; NO-SIMD128:         .functype const_splat_v4f32 (i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 4767060206681587712
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.const $push1=, 4767060206681587712
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
   ret <4 x float> <float 42., float 42., float 42., float 42.>
 }
 
-; CHECK-LABEL: extract_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype extract_v4f32 (v128) -> (f32){{$}}
-; SIMD128-NEXT: f32x4.extract_lane $push[[R:[0-9]+]]=, $0, 3{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define float @extract_v4f32(<4 x float> %v) {
+; SIMD128-LABEL: extract_v4f32:
+; SIMD128:         .functype extract_v4f32 (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 3
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_v4f32:
+; NO-SIMD128:         .functype extract_v4f32 (f32, f32, f32, f32) -> (f32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    return $3
   %elem = extractelement <4 x float> %v, i32 3
   ret float %elem
 }
 
-; CHECK-LABEL: extract_var_v4f32:
-; NO-SIMD128-NOT: i64x2
-; SIMD128-NEXT: .functype extract_var_v4f32 (v128, i32) -> (f32){{$}}
-; SIMD128-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer{{$}}
-; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: local.tee $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
-; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 3{{$}}
-; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
-; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 2{{$}}
-; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
-; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $2, $pop[[L7]]{{$}}
-; SIMD128-NEXT: f32.load $push[[R:[0-9]+]]=, 0($pop[[L2]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define float @extract_var_v4f32(<4 x float> %v, i32 %i) {
+; SIMD128-LABEL: extract_var_v4f32:
+; SIMD128:         .functype extract_var_v4f32 (v128, i32) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; SIMD128-NEXT:    i32.const $push7=, 16
+; SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; SIMD128-NEXT:    local.tee $push8=, $2=, $pop9
+; SIMD128-NEXT:    v128.store 0($pop8), $0
+; SIMD128-NEXT:    i32.const $push0=, 3
+; SIMD128-NEXT:    i32.and $push1=, $1, $pop0
+; SIMD128-NEXT:    i32.const $push2=, 2
+; SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; SIMD128-NEXT:    i32.or $push4=, $2, $pop3
+; SIMD128-NEXT:    f32.load $push5=, 0($pop4)
+; SIMD128-NEXT:    return $pop5
+;
+; NO-SIMD128-LABEL: extract_var_v4f32:
+; NO-SIMD128:         .functype extract_var_v4f32 (f32, f32, f32, f32, i32) -> (f32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; NO-SIMD128-NEXT:    i32.const $push7=, 16
+; NO-SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; NO-SIMD128-NEXT:    local.tee $push8=, $5=, $pop9
+; NO-SIMD128-NEXT:    f32.store 12($pop8), $3
+; NO-SIMD128-NEXT:    f32.store 8($5), $2
+; NO-SIMD128-NEXT:    f32.store 4($5), $1
+; NO-SIMD128-NEXT:    f32.store 0($5), $0
+; NO-SIMD128-NEXT:    i32.const $push0=, 3
+; NO-SIMD128-NEXT:    i32.and $push1=, $4, $pop0
+; NO-SIMD128-NEXT:    i32.const $push2=, 2
+; NO-SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; NO-SIMD128-NEXT:    i32.or $push4=, $5, $pop3
+; NO-SIMD128-NEXT:    f32.load $push5=, 0($pop4)
+; NO-SIMD128-NEXT:    return $pop5
   %elem = extractelement <4 x float> %v, i32 %i
   ret float %elem
 }
 
-; CHECK-LABEL: extract_zero_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype extract_zero_v4f32 (v128) -> (f32){{$}}
-; SIMD128-NEXT: f32x4.extract_lane $push[[R:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define float @extract_zero_v4f32(<4 x float> %v) {
+; SIMD128-LABEL: extract_zero_v4f32:
+; SIMD128:         .functype extract_zero_v4f32 (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_zero_v4f32:
+; NO-SIMD128:         .functype extract_zero_v4f32 (f32, f32, f32, f32) -> (f32)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    return $0
   %elem = extractelement <4 x float> %v, i32 0
   ret float %elem
 }
 
-; CHECK-LABEL: replace_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype replace_v4f32 (v128, f32) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.replace_lane $push[[R:[0-9]+]]=, $0, 2, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @replace_v4f32(<4 x float> %v, float %x) {
+; SIMD128-LABEL: replace_v4f32:
+; SIMD128:         .functype replace_v4f32 (v128, f32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.replace_lane $push0=, $0, 2, $1
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: replace_v4f32:
+; NO-SIMD128:         .functype replace_v4f32 (i32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.store 8($0), $5
+; NO-SIMD128-NEXT:    f32.store 4($0), $2
+; NO-SIMD128-NEXT:    f32.store 0($0), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 12
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    f32.store 0($pop1), $4
+; NO-SIMD128-NEXT:    return
   %res = insertelement <4 x float> %v, float %x, i32 2
   ret <4 x float> %res
 }
 
-; CHECK-LABEL: replace_var_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype replace_var_v4f32 (v128, i32, f32) -> (v128){{$}}
-; SIMD128-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer{{$}}
-; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: local.tee $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
-; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 3{{$}}
-; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
-; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 2{{$}}
-; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
-; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $3, $pop[[L7]]{{$}}
-; SIMD128-NEXT: f32.store 0($pop[[L2]]), $2{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @replace_var_v4f32(<4 x float> %v, i32 %i, float %x) {
+; SIMD128-LABEL: replace_var_v4f32:
+; SIMD128:         .functype replace_var_v4f32 (v128, i32, f32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; SIMD128-NEXT:    i32.const $push7=, 16
+; SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; SIMD128-NEXT:    local.tee $push8=, $3=, $pop9
+; SIMD128-NEXT:    v128.store 0($pop8), $0
+; SIMD128-NEXT:    i32.const $push0=, 3
+; SIMD128-NEXT:    i32.and $push1=, $1, $pop0
+; SIMD128-NEXT:    i32.const $push2=, 2
+; SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; SIMD128-NEXT:    i32.or $push4=, $3, $pop3
+; SIMD128-NEXT:    f32.store 0($pop4), $2
+; SIMD128-NEXT:    v128.load $push5=, 0($3)
+; SIMD128-NEXT:    return $pop5
+;
+; NO-SIMD128-LABEL: replace_var_v4f32:
+; NO-SIMD128:         .functype replace_var_v4f32 (i32, f32, f32, f32, f32, i32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    global.get $push7=, __stack_pointer
+; NO-SIMD128-NEXT:    i32.const $push8=, 16
+; NO-SIMD128-NEXT:    i32.sub $push10=, $pop7, $pop8
+; NO-SIMD128-NEXT:    local.tee $push9=, $7=, $pop10
+; NO-SIMD128-NEXT:    f32.store 12($pop9), $4
+; NO-SIMD128-NEXT:    f32.store 8($7), $3
+; NO-SIMD128-NEXT:    f32.store 4($7), $2
+; NO-SIMD128-NEXT:    f32.store 0($7), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 3
+; NO-SIMD128-NEXT:    i32.and $push1=, $5, $pop0
+; NO-SIMD128-NEXT:    i32.const $push2=, 2
+; NO-SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; NO-SIMD128-NEXT:    i32.or $push4=, $7, $pop3
+; NO-SIMD128-NEXT:    f32.store 0($pop4), $6
+; NO-SIMD128-NEXT:    i64.load $push5=, 8($7)
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop5
+; NO-SIMD128-NEXT:    i64.load $push6=, 0($7)
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop6
+; NO-SIMD128-NEXT:    return
   %res = insertelement <4 x float> %v, float %x, i32 %i
   ret <4 x float> %res
 }
 
-; CHECK-LABEL: replace_zero_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype replace_zero_v4f32 (v128, f32) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @replace_zero_v4f32(<4 x float> %v, float %x) {
+; SIMD128-LABEL: replace_zero_v4f32:
+; SIMD128:         .functype replace_zero_v4f32 (v128, f32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.replace_lane $push0=, $0, 0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: replace_zero_v4f32:
+; NO-SIMD128:         .functype replace_zero_v4f32 (i32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.store 8($0), $3
+; NO-SIMD128-NEXT:    f32.store 4($0), $2
+; NO-SIMD128-NEXT:    f32.store 0($0), $5
+; NO-SIMD128-NEXT:    i32.const $push0=, 12
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    f32.store 0($pop1), $4
+; NO-SIMD128-NEXT:    return
   %res = insertelement <4 x float> %v, float %x, i32 0
   ret <4 x float> %res
 }
 
-; CHECK-LABEL: shuffle_v4f32:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shuffle_v4f32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
-; SIMD128-SAME: 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @shuffle_v4f32(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: shuffle_v4f32:
+; SIMD128:         .functype shuffle_v4f32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $1, 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shuffle_v4f32:
+; NO-SIMD128:         .functype shuffle_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.store 8($0), $3
+; NO-SIMD128-NEXT:    f32.store 4($0), $6
+; NO-SIMD128-NEXT:    f32.store 0($0), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 12
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    f32.store 0($pop1), $8
+; NO-SIMD128-NEXT:    return
   %res = shufflevector <4 x float> %x, <4 x float> %y,
     <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   ret <4 x float> %res
 }
 
-; CHECK-LABEL: shuffle_undef_v4f32:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shuffle_undef_v4f32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
-; SIMD128-SAME: 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @shuffle_undef_v4f32(<4 x float> %x, <4 x float> %y) {
+; SIMD128-LABEL: shuffle_undef_v4f32:
+; SIMD128:         .functype shuffle_undef_v4f32 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shuffle_undef_v4f32:
+; NO-SIMD128:         .functype shuffle_undef_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.store 8($0), $2
+; NO-SIMD128-NEXT:    f32.store 4($0), $2
+; NO-SIMD128-NEXT:    f32.store 0($0), $2
+; NO-SIMD128-NEXT:    i32.const $push0=, 12
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    f32.store 0($pop1), $2
+; NO-SIMD128-NEXT:    return
   %res = shufflevector <4 x float> %x, <4 x float> %y,
     <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   ret <4 x float> %res
 }
 
-; CHECK-LABEL: build_v4f32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype build_v4f32 (f32, f32, f32, f32) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.splat $push[[L0:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: f32x4.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 1, $1{{$}}
-; SIMD128-NEXT: f32x4.replace_lane $push[[L2:[0-9]+]]=, $pop[[L1]], 2, $2{{$}}
-; SIMD128-NEXT: f32x4.replace_lane $push[[R:[0-9]+]]=, $pop[[L2]], 3, $3{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @build_v4f32(float %x0, float %x1, float %x2, float %x3) {
+; SIMD128-LABEL: build_v4f32:
+; SIMD128:         .functype build_v4f32 (f32, f32, f32, f32) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.splat $push0=, $0
+; SIMD128-NEXT:    f32x4.replace_lane $push1=, $pop0, 1, $1
+; SIMD128-NEXT:    f32x4.replace_lane $push2=, $pop1, 2, $2
+; SIMD128-NEXT:    f32x4.replace_lane $push3=, $pop2, 3, $3
+; SIMD128-NEXT:    return $pop3
+;
+; NO-SIMD128-LABEL: build_v4f32:
+; NO-SIMD128:         .functype build_v4f32 (i32, f32, f32, f32, f32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f32.store 8($0), $3
+; NO-SIMD128-NEXT:    f32.store 4($0), $2
+; NO-SIMD128-NEXT:    f32.store 0($0), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 12
+; NO-SIMD128-NEXT:    i32.add $push1=, $0, $pop0
+; NO-SIMD128-NEXT:    f32.store 0($pop1), $4
+; NO-SIMD128-NEXT:    return
   %t0 = insertelement <4 x float> undef, float %x0, i32 0
   %t1 = insertelement <4 x float> %t0, float %x1, i32 1
   %t2 = insertelement <4 x float> %t1, float %x2, i32 2
@@ -969,144 +2061,253 @@ define <4 x float> @build_v4f32(float %x0, float %x1, float %x2, float %x3) {
 ; ==============================================================================
 ; 2 x f64
 ; ==============================================================================
-; CHECK-LABEL: const_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype const_v2f64 () -> (v128){{$}}
-; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 0x1.60504030201p-911, 0x1.e0d0c0b0a0908p-783{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @const_v2f64() {
+; SIMD128-LABEL: const_v2f64:
+; SIMD128:         .functype const_v2f64 () -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0x1.60504030201p-911, 0x1.e0d0c0b0a0908p-783
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: const_v2f64:
+; NO-SIMD128:         .functype const_v2f64 (i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 1084818905618843912
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.const $push1=, 506097522914230528
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
   ret <2 x double> <double 0x0706050403020100, double 0x0F0E0D0C0B0A0908>
 }
 
-; CHECK-LABEL: splat_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype splat_v2f64 (f64) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.splat $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @splat_v2f64(double %x) {
+; SIMD128-LABEL: splat_v2f64:
+; SIMD128:         .functype splat_v2f64 (f64) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.splat $push0=, $0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: splat_v2f64:
+; NO-SIMD128:         .functype splat_v2f64 (i32, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.store 8($0), $1
+; NO-SIMD128-NEXT:    f64.store 0($0), $1
+; NO-SIMD128-NEXT:    return
   %t1 = insertelement <2 x double> zeroinitializer, double %x, i3 0
   %res = insertelement <2 x double> %t1, double %x, i32 1
   ret <2 x double> %res
 }
 
-; CHECK-LABEL: const_splat_v2f64:
-; SIMD128: v128.const $push0=, 0x1.5p5, 0x1.5p5{{$}}
 define <2 x double> @const_splat_v2f64() {
+; SIMD128-LABEL: const_splat_v2f64:
+; SIMD128:         .functype const_splat_v2f64 () -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    v128.const $push0=, 0x1.5p5, 0x1.5p5
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: const_splat_v2f64:
+; NO-SIMD128:         .functype const_splat_v2f64 (i32) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    i64.const $push0=, 4631107791820423168
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop0
+; NO-SIMD128-NEXT:    i64.const $push1=, 4631107791820423168
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop1
+; NO-SIMD128-NEXT:    return
   ret <2 x double> <double 42., double 42.>
 }
 
-; CHECK-LABEL: extract_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype extract_v2f64 (v128) -> (f64){{$}}
-; SIMD128-NEXT: f64x2.extract_lane $push[[R:[0-9]+]]=, $0, 1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define double @extract_v2f64(<2 x double> %v) {
+; SIMD128-LABEL: extract_v2f64:
+; SIMD128:         .functype extract_v2f64 (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_v2f64:
+; NO-SIMD128:         .functype extract_v2f64 (f64, f64) -> (f64)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    return $1
   %elem = extractelement <2 x double> %v, i32 1
   ret double %elem
 }
 
-; CHECK-LABEL: extract_var_v2f64:
-; NO-SIMD128-NOT: i62x2
-; SIMD128-NEXT: .functype extract_var_v2f64 (v128, i32) -> (f64){{$}}
-; SIMD128-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer{{$}}
-; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: local.tee $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
-; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 1{{$}}
-; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
-; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 3{{$}}
-; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
-; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $2, $pop[[L7]]{{$}}
-; SIMD128-NEXT: f64.load $push[[R:[0-9]+]]=, 0($pop[[L2]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define double @extract_var_v2f64(<2 x double> %v, i32 %i) {
+; SIMD128-LABEL: extract_var_v2f64:
+; SIMD128:         .functype extract_var_v2f64 (v128, i32) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; SIMD128-NEXT:    i32.const $push7=, 16
+; SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; SIMD128-NEXT:    local.tee $push8=, $2=, $pop9
+; SIMD128-NEXT:    v128.store 0($pop8), $0
+; SIMD128-NEXT:    i32.const $push0=, 1
+; SIMD128-NEXT:    i32.and $push1=, $1, $pop0
+; SIMD128-NEXT:    i32.const $push2=, 3
+; SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; SIMD128-NEXT:    i32.or $push4=, $2, $pop3
+; SIMD128-NEXT:    f64.load $push5=, 0($pop4)
+; SIMD128-NEXT:    return $pop5
+;
+; NO-SIMD128-LABEL: extract_var_v2f64:
+; NO-SIMD128:         .functype extract_var_v2f64 (f64, f64, i32) -> (f64)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; NO-SIMD128-NEXT:    i32.const $push7=, 16
+; NO-SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; NO-SIMD128-NEXT:    local.tee $push8=, $3=, $pop9
+; NO-SIMD128-NEXT:    f64.store 8($pop8), $1
+; NO-SIMD128-NEXT:    f64.store 0($3), $0
+; NO-SIMD128-NEXT:    i32.const $push0=, 1
+; NO-SIMD128-NEXT:    i32.and $push1=, $2, $pop0
+; NO-SIMD128-NEXT:    i32.const $push2=, 3
+; NO-SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; NO-SIMD128-NEXT:    i32.or $push4=, $3, $pop3
+; NO-SIMD128-NEXT:    f64.load $push5=, 0($pop4)
+; NO-SIMD128-NEXT:    return $pop5
   %elem = extractelement <2 x double> %v, i32 %i
   ret double %elem
 }
 
-; CHECK-LABEL: extract_zero_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype extract_zero_v2f64 (v128) -> (f64){{$}}
-; SIMD128-NEXT: f64x2.extract_lane $push[[R:[0-9]+]]=, $0, 0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define double @extract_zero_v2f64(<2 x double> %v) {
+; SIMD128-LABEL: extract_zero_v2f64:
+; SIMD128:         .functype extract_zero_v2f64 (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 0
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: extract_zero_v2f64:
+; NO-SIMD128:         .functype extract_zero_v2f64 (f64, f64) -> (f64)
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    return $0
   %elem = extractelement <2 x double> %v, i32 0
   ret double %elem
 }
 
-; CHECK-LABEL: replace_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype replace_v2f64 (v128, f64) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @replace_v2f64(<2 x double> %v, double %x) {
+; SIMD128-LABEL: replace_v2f64:
+; SIMD128:         .functype replace_v2f64 (v128, f64) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.replace_lane $push0=, $0, 0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: replace_v2f64:
+; NO-SIMD128:         .functype replace_v2f64 (i32, f64, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.store 8($0), $2
+; NO-SIMD128-NEXT:    f64.store 0($0), $3
+; NO-SIMD128-NEXT:    return
   %res = insertelement <2 x double> %v, double %x, i32 0
   ret <2 x double> %res
 }
 
-; CHECK-LABEL: replace_var_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype replace_var_v2f64 (v128, i32, f64) -> (v128){{$}}
-; SIMD128-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer{{$}}
-; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
-; SIMD128-NEXT: local.tee $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
-; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 1{{$}}
-; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
-; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 3{{$}}
-; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
-; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $3, $pop[[L7]]{{$}}
-; SIMD128-NEXT: f64.store 0($pop[[L2]]), $2{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @replace_var_v2f64(<2 x double> %v, i32 %i, double %x) {
+; SIMD128-LABEL: replace_var_v2f64:
+; SIMD128:         .functype replace_var_v2f64 (v128, i32, f64) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    global.get $push6=, __stack_pointer
+; SIMD128-NEXT:    i32.const $push7=, 16
+; SIMD128-NEXT:    i32.sub $push9=, $pop6, $pop7
+; SIMD128-NEXT:    local.tee $push8=, $3=, $pop9
+; SIMD128-NEXT:    v128.store 0($pop8), $0
+; SIMD128-NEXT:    i32.const $push0=, 1
+; SIMD128-NEXT:    i32.and $push1=, $1, $pop0
+; SIMD128-NEXT:    i32.const $push2=, 3
+; SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; SIMD128-NEXT:    i32.or $push4=, $3, $pop3
+; SIMD128-NEXT:    f64.store 0($pop4), $2
+; SIMD128-NEXT:    v128.load $push5=, 0($3)
+; SIMD128-NEXT:    return $pop5
+;
+; NO-SIMD128-LABEL: replace_var_v2f64:
+; NO-SIMD128:         .functype replace_var_v2f64 (i32, f64, f64, i32, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    global.get $push7=, __stack_pointer
+; NO-SIMD128-NEXT:    i32.const $push8=, 16
+; NO-SIMD128-NEXT:    i32.sub $push10=, $pop7, $pop8
+; NO-SIMD128-NEXT:    local.tee $push9=, $5=, $pop10
+; NO-SIMD128-NEXT:    f64.store 8($pop9), $2
+; NO-SIMD128-NEXT:    f64.store 0($5), $1
+; NO-SIMD128-NEXT:    i32.const $push0=, 1
+; NO-SIMD128-NEXT:    i32.and $push1=, $3, $pop0
+; NO-SIMD128-NEXT:    i32.const $push2=, 3
+; NO-SIMD128-NEXT:    i32.shl $push3=, $pop1, $pop2
+; NO-SIMD128-NEXT:    i32.or $push4=, $5, $pop3
+; NO-SIMD128-NEXT:    f64.store 0($pop4), $4
+; NO-SIMD128-NEXT:    f64.load $push5=, 8($5)
+; NO-SIMD128-NEXT:    f64.store 8($0), $pop5
+; NO-SIMD128-NEXT:    f64.load $push6=, 0($5)
+; NO-SIMD128-NEXT:    f64.store 0($0), $pop6
+; NO-SIMD128-NEXT:    return
   %res = insertelement <2 x double> %v, double %x, i32 %i
   ret <2 x double> %res
 }
 
-; CHECK-LABEL: replace_zero_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype replace_zero_v2f64 (v128, f64) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @replace_zero_v2f64(<2 x double> %v, double %x) {
+; SIMD128-LABEL: replace_zero_v2f64:
+; SIMD128:         .functype replace_zero_v2f64 (v128, f64) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.replace_lane $push0=, $0, 0, $1
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: replace_zero_v2f64:
+; NO-SIMD128:         .functype replace_zero_v2f64 (i32, f64, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.store 8($0), $2
+; NO-SIMD128-NEXT:    f64.store 0($0), $3
+; NO-SIMD128-NEXT:    return
   %res = insertelement <2 x double> %v, double %x, i32 0
   ret <2 x double> %res
 }
 
-; CHECK-LABEL: shuffle_v2f64:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shuffle_v2f64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
-; SIMD128-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @shuffle_v2f64(<2 x double> %x, <2 x double> %y) {
+; SIMD128-LABEL: shuffle_v2f64:
+; SIMD128:         .functype shuffle_v2f64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $1, 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shuffle_v2f64:
+; NO-SIMD128:         .functype shuffle_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.store 8($0), $4
+; NO-SIMD128-NEXT:    f64.store 0($0), $1
+; NO-SIMD128-NEXT:    return
   %res = shufflevector <2 x double> %x, <2 x double> %y,
     <2 x i32> <i32 0, i32 3>
   ret <2 x double> %res
 }
 
-; CHECK-LABEL: shuffle_undef_v2f64:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shuffle_undef_v2f64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
-; SIMD128-SAME: 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @shuffle_undef_v2f64(<2 x double> %x, <2 x double> %y) {
+; SIMD128-LABEL: shuffle_undef_v2f64:
+; SIMD128:         .functype shuffle_undef_v2f64 (v128, v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    return $pop0
+;
+; NO-SIMD128-LABEL: shuffle_undef_v2f64:
+; NO-SIMD128:         .functype shuffle_undef_v2f64 (i32, f64, f64, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.store 8($0), $2
+; NO-SIMD128-NEXT:    f64.store 0($0), $2
+; NO-SIMD128-NEXT:    return
   %res = shufflevector <2 x double> %x, <2 x double> %y,
     <2 x i32> <i32 1, i32 undef>
   ret <2 x double> %res
 }
 
-; CHECK-LABEL: build_v2f64:
-; NO-SIMD128-NOT: f64x2
-; SIMD128-NEXT: .functype build_v2f64 (f64, f64) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.splat $push[[L0:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: f64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L0]], 1, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @build_v2f64(double %x0, double %x1) {
+; SIMD128-LABEL: build_v2f64:
+; SIMD128:         .functype build_v2f64 (f64, f64) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.splat $push0=, $0
+; SIMD128-NEXT:    f64x2.replace_lane $push1=, $pop0, 1, $1
+; SIMD128-NEXT:    return $pop1
+;
+; NO-SIMD128-LABEL: build_v2f64:
+; NO-SIMD128:         .functype build_v2f64 (i32, f64, f64) -> ()
+; NO-SIMD128-NEXT:  # %bb.0:
+; NO-SIMD128-NEXT:    f64.store 8($0), $2
+; NO-SIMD128-NEXT:    f64.store 0($0), $1
+; NO-SIMD128-NEXT:    return
   %t0 = insertelement <2 x double> undef, double %x0, i32 0
   %res = insertelement <2 x double> %t0, double %x1, i32 1
   ret <2 x double> %res

diff  --git a/llvm/test/CodeGen/WebAssembly/stack-protector.ll b/llvm/test/CodeGen/WebAssembly/stack-protector.ll
index 3a97849b5920e..1b36b614515fd 100644
--- a/llvm/test/CodeGen/WebAssembly/stack-protector.ll
+++ b/llvm/test/CodeGen/WebAssembly/stack-protector.ll
@@ -1,14 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=wasm32-unknown-unknown < %s | FileCheck -check-prefix=WASM32 %s
 
 @"\01LC" = internal constant [11 x i8] c"buf == %s\0A\00"		; <ptr> [#uses=1]
 
-; WASM32-LABEL: test:
-; WASM32:      i32.load        28
-; WASM32:      br_if           0
-; WASM32:      call __stack_chk_fail
-; WASM32-NEXT: unreachable
-
 define void @test(ptr %a) nounwind ssp {
+; WASM32-LABEL: test:
+; WASM32:         .functype test (i32) -> ()
+; WASM32-NEXT:    .local i32
+; WASM32-NEXT:  # %bb.0: # %entry
+; WASM32-NEXT:    global.get __stack_pointer
+; WASM32-NEXT:    i32.const 32
+; WASM32-NEXT:    i32.sub
+; WASM32-NEXT:    local.tee 1
+; WASM32-NEXT:    global.set __stack_pointer
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    local.get 0
+; WASM32-NEXT:    i32.store 16
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    i32.const 0
+; WASM32-NEXT:    i32.load __stack_chk_guard
+; WASM32-NEXT:    i32.store 28
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    i32.const 20
+; WASM32-NEXT:    i32.add
+; WASM32-NEXT:    local.get 0
+; WASM32-NEXT:    call strcpy
+; WASM32-NEXT:    drop
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    i32.const 20
+; WASM32-NEXT:    i32.add
+; WASM32-NEXT:    i32.store 0
+; WASM32-NEXT:    i32.const LC
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    call printf
+; WASM32-NEXT:    drop
+; WASM32-NEXT:    block
+; WASM32-NEXT:    i32.const 0
+; WASM32-NEXT:    i32.load __stack_chk_guard
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    i32.load 28
+; WASM32-NEXT:    i32.eq
+; WASM32-NEXT:    br_if 0 # 0: down to label0
+; WASM32-NEXT:  # %bb.1: # %return
+; WASM32-NEXT:    call __stack_chk_fail
+; WASM32-NEXT:    unreachable
+; WASM32-NEXT:  .LBB0_2: # %return
+; WASM32-NEXT:    end_block # label0:
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    i32.const 32
+; WASM32-NEXT:    i32.add
+; WASM32-NEXT:    global.set __stack_pointer
+; WASM32-NEXT:    # fallthrough-return
 entry:
 	%a_addr = alloca ptr		; <ptr> [#uses=2]
 	%buf = alloca [8 x i8]		; <ptr> [#uses=2]
@@ -23,11 +66,56 @@ return:		; preds = %entry
 	ret void
 }
 
-; WASM32-LABEL: test_return_i32:
-; WASM32:      call __stack_chk_fail
-; WASM32-NEXT: unreachable
-
 define i32 @test_return_i32(ptr %a) nounwind ssp {
+; WASM32-LABEL: test_return_i32:
+; WASM32:         .functype test_return_i32 (i32) -> (i32)
+; WASM32-NEXT:    .local i32
+; WASM32-NEXT:  # %bb.0: # %entry
+; WASM32-NEXT:    global.get __stack_pointer
+; WASM32-NEXT:    i32.const 32
+; WASM32-NEXT:    i32.sub
+; WASM32-NEXT:    local.tee 1
+; WASM32-NEXT:    global.set __stack_pointer
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    local.get 0
+; WASM32-NEXT:    i32.store 16
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    i32.const 0
+; WASM32-NEXT:    i32.load __stack_chk_guard
+; WASM32-NEXT:    i32.store 28
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    i32.const 20
+; WASM32-NEXT:    i32.add
+; WASM32-NEXT:    local.get 0
+; WASM32-NEXT:    call strcpy
+; WASM32-NEXT:    drop
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    i32.const 20
+; WASM32-NEXT:    i32.add
+; WASM32-NEXT:    i32.store 0
+; WASM32-NEXT:    i32.const LC
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    call printf
+; WASM32-NEXT:    drop
+; WASM32-NEXT:    block
+; WASM32-NEXT:    i32.const 0
+; WASM32-NEXT:    i32.load __stack_chk_guard
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    i32.load 28
+; WASM32-NEXT:    i32.eq
+; WASM32-NEXT:    br_if 0 # 0: down to label1
+; WASM32-NEXT:  # %bb.1: # %return
+; WASM32-NEXT:    call __stack_chk_fail
+; WASM32-NEXT:    unreachable
+; WASM32-NEXT:  .LBB1_2: # %return
+; WASM32-NEXT:    end_block # label1:
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    i32.const 32
+; WASM32-NEXT:    i32.add
+; WASM32-NEXT:    global.set __stack_pointer
+; WASM32-NEXT:    i32.const 0
+; WASM32-NEXT:    # fallthrough-return
 entry:
   %a_addr = alloca ptr    ; <ptr> [#uses=2]
   %buf = alloca [8 x i8]    ; <ptr> [#uses=2]

diff  --git a/llvm/test/CodeGen/WebAssembly/umulo-i64.ll b/llvm/test/CodeGen/WebAssembly/umulo-i64.ll
index dabe643e32283..85b6bd244090a 100644
--- a/llvm/test/CodeGen/WebAssembly/umulo-i64.ll
+++ b/llvm/test/CodeGen/WebAssembly/umulo-i64.ll
@@ -1,11 +1,24 @@
-; RUN: llc < %s -asm-verbose=false -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
 ; Test that UMULO works correctly on 64-bit operands.
 target triple = "wasm32-unknown-unknown"
 
-; CHECK-LABEL: _ZN4core3num21_$LT$impl$u20$u64$GT$15overflowing_mul17h07be88b4cbac028fE:
-; CHECK:     __multi3
 ; Function Attrs: inlinehint
 define void @"_ZN4core3num21_$LT$impl$u20$u64$GT$15overflowing_mul17h07be88b4cbac028fE"(i64, i64) unnamed_addr #0 {
+; CHECK-LABEL: _ZN4core3num21_$LT$impl$u20$u64$GT$15overflowing_mul17h07be88b4cbac028fE:
+; CHECK:         .functype _ZN4core3num21_$LT$impl$u20$u64$GT$15overflowing_mul17h07be88b4cbac028fE (i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0: # %start
+; CHECK-NEXT:    global.get $push2=, __stack_pointer
+; CHECK-NEXT:    i32.const $push3=, 16
+; CHECK-NEXT:    i32.sub $push6=, $pop2, $pop3
+; CHECK-NEXT:    local.tee $push5=, $2=, $pop6
+; CHECK-NEXT:    global.set __stack_pointer, $pop5
+; CHECK-NEXT:    i64.const $push0=, 0
+; CHECK-NEXT:    i64.const $push4=, 0
+; CHECK-NEXT:    call __multi3, $2, $0, $pop0, $1, $pop4
+; CHECK-NEXT:    i64.load $push1=, 0($2)
+; CHECK-NEXT:    i64.store 0($2), $pop1
+; CHECK-NEXT:    unreachable
 start:
   %2 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %0, i64 %1)
   %3 = extractvalue { i64, i1 } %2, 0
@@ -19,10 +32,25 @@ declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) #1
 attributes #0 = { inlinehint }
 attributes #1 = { nounwind readnone speculatable }
 
-; CHECK-LABEL: wut:
-; CHECK: call     __multi3, $2, $0, $pop0, $1, $pop7
-; CHECK: i64.load $1=, 8($2)
 define i1 @wut(i64, i64) {
+; CHECK-LABEL: wut:
+; CHECK:         .functype wut (i64, i64) -> (i32)
+; CHECK-NEXT:  # %bb.0: # %start
+; CHECK-NEXT:    global.get $push2=, __stack_pointer
+; CHECK-NEXT:    i32.const $push3=, 16
+; CHECK-NEXT:    i32.sub $push9=, $pop2, $pop3
+; CHECK-NEXT:    local.tee $push8=, $2=, $pop9
+; CHECK-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-NEXT:    i64.const $push0=, 0
+; CHECK-NEXT:    i64.const $push7=, 0
+; CHECK-NEXT:    call __multi3, $2, $0, $pop0, $1, $pop7
+; CHECK-NEXT:    i64.load $1=, 8($2)
+; CHECK-NEXT:    i32.const $push4=, 16
+; CHECK-NEXT:    i32.add $push5=, $2, $pop4
+; CHECK-NEXT:    global.set __stack_pointer, $pop5
+; CHECK-NEXT:    i64.const $push6=, 0
+; CHECK-NEXT:    i64.ne $push1=, $1, $pop6
+; CHECK-NEXT:    # fallthrough-return
 start:
   %2 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %0, i64 %1)
   %3 = extractvalue { i64, i1 } %2, 1

diff  --git a/llvm/test/CodeGen/WebAssembly/userstack.ll b/llvm/test/CodeGen/WebAssembly/userstack.ll
index 61706dbca1656..98218db866e38 100644
--- a/llvm/test/CodeGen/WebAssembly/userstack.ll
+++ b/llvm/test/CodeGen/WebAssembly/userstack.ll
@@ -1,287 +1,669 @@
-; RUN: llc < %s --mtriple=wasm32-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck -DPTR=32 %s
-; RUN: llc < %s --mtriple=wasm64-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck -DPTR=64 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mtriple=wasm32-unknown-unknown -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck -DPTR=32 %s --check-prefix=CHECK-32
+; RUN: llc < %s --mtriple=wasm64-unknown-unknown -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck -DPTR=64 %s --check-prefix=CHECK-64
 
 declare void @ext_func(ptr %ptr)
 declare void @ext_func_i32(ptr %ptr)
 
-; CHECK: .globaltype	__stack_pointer, i[[PTR]]{{$}}
-
-; CHECK-LABEL: alloca32:
 ; Check that there is an extra local for the stack pointer.
-; CHECK: .local i[[PTR]]{{$}}
 define void @alloca32() noredzone {
- ; CHECK-NEXT: global.get $push[[L2:.+]]=, __stack_pointer{{$}}
- ; CHECK-NEXT: i[[PTR]].const $push[[L3:.+]]=, 16
- ; CHECK-NEXT: i[[PTR]].sub $push[[L9:.+]]=, $pop[[L2]], $pop[[L3]]
- ; CHECK-NEXT: local.tee $push[[L8:.+]]=, [[SP:.+]], $pop[[L9]]{{$}}
- ; CHECK-NEXT: global.set __stack_pointer, $pop[[L8]]{{$}}
+; CHECK-32-LABEL: alloca32:
+; CHECK-32:         .functype alloca32 () -> ()
+; CHECK-32-NEXT:    .local i32
+; CHECK-32-NEXT:  # %bb.0:
+; CHECK-32-NEXT:    global.get $push1=, __stack_pointer
+; CHECK-32-NEXT:    i32.const $push2=, 16
+; CHECK-32-NEXT:    i32.sub $push6=, $pop1, $pop2
+; CHECK-32-NEXT:    local.tee $push5=, 0, $pop6
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop5
+; CHECK-32-NEXT:    local.get $push7=, 0
+; CHECK-32-NEXT:    i32.const $push0=, 0
+; CHECK-32-NEXT:    i32.store 12($pop7), $pop0
+; CHECK-32-NEXT:    local.get $push8=, 0
+; CHECK-32-NEXT:    i32.const $push3=, 16
+; CHECK-32-NEXT:    i32.add $push4=, $pop8, $pop3
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop4
+; CHECK-32-NEXT:    return
+;
+; CHECK-64-LABEL: alloca32:
+; CHECK-64:         .functype alloca32 () -> ()
+; CHECK-64-NEXT:    .local i64
+; CHECK-64-NEXT:  # %bb.0:
+; CHECK-64-NEXT:    global.get $push1=, __stack_pointer
+; CHECK-64-NEXT:    i64.const $push2=, 16
+; CHECK-64-NEXT:    i64.sub $push6=, $pop1, $pop2
+; CHECK-64-NEXT:    local.tee $push5=, 0, $pop6
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop5
+; CHECK-64-NEXT:    local.get $push7=, 0
+; CHECK-64-NEXT:    i32.const $push0=, 0
+; CHECK-64-NEXT:    i32.store 12($pop7), $pop0
+; CHECK-64-NEXT:    local.get $push8=, 0
+; CHECK-64-NEXT:    i64.const $push3=, 16
+; CHECK-64-NEXT:    i64.add $push4=, $pop8, $pop3
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop4
+; CHECK-64-NEXT:    return
  %retval = alloca i32
- ; CHECK: local.get $push[[L4:.+]]=, [[SP]]{{$}}
- ; CHECK: i32.const $push[[L0:.+]]=, 0
- ; CHECK: i32.store 12($pop[[L4]]), $pop[[L0]]
  store i32 0, ptr %retval
- ; CHECK: local.get $push[[L6:.+]]=, [[SP]]{{$}}
- ; CHECK-NEXT: i[[PTR]].const $push[[L5:.+]]=, 16
- ; CHECK-NEXT: i[[PTR]].add $push[[L7:.+]]=, $pop[[L6]], $pop[[L5]]
- ; CHECK-NEXT: global.set __stack_pointer, $pop[[L7]]
  ret void
 }
 
-; CHECK-LABEL: alloca3264:
-; CHECK: .local i[[PTR]]{{$}}
 define void @alloca3264() {
- ; CHECK: global.get $push[[L3:.+]]=, __stack_pointer{{$}}
- ; CHECK-NEXT: i[[PTR]].const $push[[L4:.+]]=, 16
- ; CHECK-NEXT: i[[PTR]].sub $push[[L6:.+]]=, $pop[[L3]], $pop[[L4]]
- ; CHECK-NEXT: local.tee $push[[L5:.+]]=, [[SP:.+]], $pop[[L6]]
+; CHECK-32-LABEL: alloca3264:
+; CHECK-32:         .functype alloca3264 () -> ()
+; CHECK-32-NEXT:    .local i32
+; CHECK-32-NEXT:  # %bb.0:
+; CHECK-32-NEXT:    global.get $push2=, __stack_pointer
+; CHECK-32-NEXT:    i32.const $push3=, 16
+; CHECK-32-NEXT:    i32.sub $push5=, $pop2, $pop3
+; CHECK-32-NEXT:    local.tee $push4=, 0, $pop5
+; CHECK-32-NEXT:    i64.const $push0=, 0
+; CHECK-32-NEXT:    i64.store 0($pop4), $pop0
+; CHECK-32-NEXT:    local.get $push6=, 0
+; CHECK-32-NEXT:    i32.const $push1=, 0
+; CHECK-32-NEXT:    i32.store 12($pop6), $pop1
+; CHECK-32-NEXT:    return
+;
+; CHECK-64-LABEL: alloca3264:
+; CHECK-64:         .functype alloca3264 () -> ()
+; CHECK-64-NEXT:    .local i64
+; CHECK-64-NEXT:  # %bb.0:
+; CHECK-64-NEXT:    global.get $push2=, __stack_pointer
+; CHECK-64-NEXT:    i64.const $push3=, 16
+; CHECK-64-NEXT:    i64.sub $push5=, $pop2, $pop3
+; CHECK-64-NEXT:    local.tee $push4=, 0, $pop5
+; CHECK-64-NEXT:    i64.const $push0=, 0
+; CHECK-64-NEXT:    i64.store 0($pop4), $pop0
+; CHECK-64-NEXT:    local.get $push6=, 0
+; CHECK-64-NEXT:    i32.const $push1=, 0
+; CHECK-64-NEXT:    i32.store 12($pop6), $pop1
+; CHECK-64-NEXT:    return
  %r1 = alloca i32
  %r2 = alloca double
  store i32 0, ptr %r1
  store double 0.0, ptr %r2
- ; CHECK-NEXT: i64.const $push[[L1:.+]]=, 0
- ; CHECK-NEXT: i64.store 0($pop[[L5]]), $pop[[L1]]
- ; CHECK-NEXT: local.get $push[[L2:.+]]=, [[SP]]{{$}}
- ; CHECK-NEXT: i32.const $push[[L0:.+]]=, 0
- ; CHECK-NEXT: i32.store 12($pop[[L2]]), $pop[[L0]]
- ; CHECK-NEXT: return
  ret void
 }
 
-; CHECK-LABEL: allocarray:
-; CHECK: .local i[[PTR]]{{$}}
 define void @allocarray() {
- ; CHECK-NEXT: global.get $push[[L4:.+]]=, __stack_pointer{{$}}
- ; CHECK-NEXT: i[[PTR]].const $push[[L5:.+]]=, 144{{$}}
- ; CHECK-NEXT: i[[PTR]].sub $push[[L12:.+]]=, $pop[[L4]], $pop[[L5]]
- ; CHECK-NEXT: local.tee $push[[L11:.+]]=, 0, $pop[[L12]]
- ; CHECK-NEXT: global.set __stack_pointer, $pop[[L11]]
+; CHECK-32-LABEL: allocarray:
+; CHECK-32:         .functype allocarray () -> ()
+; CHECK-32-NEXT:    .local i32
+; CHECK-32-NEXT:  # %bb.0:
+; CHECK-32-NEXT:    global.get $push3=, __stack_pointer
+; CHECK-32-NEXT:    i32.const $push4=, 144
+; CHECK-32-NEXT:    i32.sub $push9=, $pop3, $pop4
+; CHECK-32-NEXT:    local.tee $push8=, 0, $pop9
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-32-NEXT:    local.get $push10=, 0
+; CHECK-32-NEXT:    i32.const $push0=, 24
+; CHECK-32-NEXT:    i32.add $push1=, $pop10, $pop0
+; CHECK-32-NEXT:    i32.const $push2=, 1
+; CHECK-32-NEXT:    i32.store 0($pop1), $pop2
+; CHECK-32-NEXT:    local.get $push11=, 0
+; CHECK-32-NEXT:    i32.const $push7=, 1
+; CHECK-32-NEXT:    i32.store 12($pop11), $pop7
+; CHECK-32-NEXT:    local.get $push12=, 0
+; CHECK-32-NEXT:    i32.const $push5=, 144
+; CHECK-32-NEXT:    i32.add $push6=, $pop12, $pop5
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-32-NEXT:    return
+;
+; CHECK-64-LABEL: allocarray:
+; CHECK-64:         .functype allocarray () -> ()
+; CHECK-64-NEXT:    .local i64
+; CHECK-64-NEXT:  # %bb.0:
+; CHECK-64-NEXT:    global.get $push3=, __stack_pointer
+; CHECK-64-NEXT:    i64.const $push4=, 144
+; CHECK-64-NEXT:    i64.sub $push9=, $pop3, $pop4
+; CHECK-64-NEXT:    local.tee $push8=, 0, $pop9
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-64-NEXT:    local.get $push10=, 0
+; CHECK-64-NEXT:    i64.const $push0=, 24
+; CHECK-64-NEXT:    i64.add $push1=, $pop10, $pop0
+; CHECK-64-NEXT:    i32.const $push2=, 1
+; CHECK-64-NEXT:    i32.store 0($pop1), $pop2
+; CHECK-64-NEXT:    local.get $push11=, 0
+; CHECK-64-NEXT:    i32.const $push7=, 1
+; CHECK-64-NEXT:    i32.store 12($pop11), $pop7
+; CHECK-64-NEXT:    local.get $push12=, 0
+; CHECK-64-NEXT:    i64.const $push5=, 144
+; CHECK-64-NEXT:    i64.add $push6=, $pop12, $pop5
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-64-NEXT:    return
  %r = alloca [33 x i32]
-
- ; CHECK:      i[[PTR]].const $push{{.+}}=, 24
- ; CHECK-NEXT: i[[PTR]].add $push[[L3:.+]]=, $pop{{.+}}, $pop{{.+}}
- ; CHECK-NEXT: i32.const $push[[L1:.+]]=, 1{{$}}
- ; CHECK-NEXT: i32.store 0($pop[[L3]]), $pop[[L1]]{{$}}
- ; CHECK-NEXT: local.get $push[[L4:.+]]=, 0{{$}}
- ; CHECK-NEXT: i32.const $push[[L10:.+]]=, 1{{$}}
- ; CHECK-NEXT: i32.store 12($pop[[L4]]), $pop[[L10]]{{$}}
  store i32 1, ptr %r
  %p2 = getelementptr [33 x i32], ptr %r, i32 0, i32 3
  store i32 1, ptr %p2
-
- ; CHECK-NEXT: local.get $push[[L2:.+]]=, [[SP]]{{$}}
- ; CHECK-NEXT: i[[PTR]].const $push[[L7:.+]]=, 144
- ; CHECK-NEXT: i[[PTR]].add $push[[L8:.+]]=, $pop[[L2]], $pop[[L7]]
- ; CHECK-NEXT: global.set __stack_pointer, $pop[[L8]]
  ret void
 }
 
-; CHECK-LABEL: non_mem_use
 define void @non_mem_use(ptr %addr) {
- ; CHECK: i[[PTR]].const $push[[L2:.+]]=, 48
- ; CHECK-NEXT: i[[PTR]].sub $push[[L12:.+]]=, {{.+}}, $pop[[L2]]
- ; CHECK-NEXT: local.tee $push[[L11:.+]]=, [[SP:.+]], $pop[[L12]]
- ; CHECK-NEXT: global.set __stack_pointer, $pop[[L11]]
+; CHECK-32-LABEL: non_mem_use:
+; CHECK-32:         .functype non_mem_use (i32) -> ()
+; CHECK-32-NEXT:    .local i32
+; CHECK-32-NEXT:  # %bb.0:
+; CHECK-32-NEXT:    global.get $push0=, __stack_pointer
+; CHECK-32-NEXT:    i32.const $push1=, 48
+; CHECK-32-NEXT:    i32.sub $push9=, $pop0, $pop1
+; CHECK-32-NEXT:    local.tee $push8=, 1, $pop9
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-32-NEXT:    local.get $push10=, 1
+; CHECK-32-NEXT:    i32.const $push4=, 8
+; CHECK-32-NEXT:    i32.add $push5=, $pop10, $pop4
+; CHECK-32-NEXT:    call ext_func, $pop5
+; CHECK-32-NEXT:    local.get $push11=, 1
+; CHECK-32-NEXT:    call ext_func, $pop11
+; CHECK-32-NEXT:    local.get $push13=, 0
+; CHECK-32-NEXT:    local.get $push12=, 1
+; CHECK-32-NEXT:    i32.const $push6=, 16
+; CHECK-32-NEXT:    i32.add $push7=, $pop12, $pop6
+; CHECK-32-NEXT:    i32.store 0($pop13), $pop7
+; CHECK-32-NEXT:    local.get $push14=, 1
+; CHECK-32-NEXT:    i32.const $push2=, 48
+; CHECK-32-NEXT:    i32.add $push3=, $pop14, $pop2
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop3
+; CHECK-32-NEXT:    return
+;
+; CHECK-64-LABEL: non_mem_use:
+; CHECK-64:         .functype non_mem_use (i64) -> ()
+; CHECK-64-NEXT:    .local i64
+; CHECK-64-NEXT:  # %bb.0:
+; CHECK-64-NEXT:    global.get $push0=, __stack_pointer
+; CHECK-64-NEXT:    i64.const $push1=, 48
+; CHECK-64-NEXT:    i64.sub $push9=, $pop0, $pop1
+; CHECK-64-NEXT:    local.tee $push8=, 1, $pop9
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-64-NEXT:    local.get $push10=, 1
+; CHECK-64-NEXT:    i64.const $push4=, 8
+; CHECK-64-NEXT:    i64.add $push5=, $pop10, $pop4
+; CHECK-64-NEXT:    call ext_func, $pop5
+; CHECK-64-NEXT:    local.get $push11=, 1
+; CHECK-64-NEXT:    call ext_func, $pop11
+; CHECK-64-NEXT:    local.get $push13=, 0
+; CHECK-64-NEXT:    local.get $push12=, 1
+; CHECK-64-NEXT:    i64.const $push6=, 16
+; CHECK-64-NEXT:    i64.add $push7=, $pop12, $pop6
+; CHECK-64-NEXT:    i64.store 0($pop13), $pop7
+; CHECK-64-NEXT:    local.get $push14=, 1
+; CHECK-64-NEXT:    i64.const $push2=, 48
+; CHECK-64-NEXT:    i64.add $push3=, $pop14, $pop2
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop3
+; CHECK-64-NEXT:    return
  %buf = alloca [27 x i8], align 16
  %r = alloca i64
  %r2 = alloca i64
  ; %r is at SP+8
- ; CHECK: local.get $push[[L3:.+]]=, [[SP]]
- ; CHECK: i[[PTR]].const $push[[OFF:.+]]=, 8
- ; CHECK-NEXT: i[[PTR]].add $push[[ARG1:.+]]=, $pop[[L3]], $pop[[OFF]]
- ; CHECK-NEXT: call ext_func, $pop[[ARG1]]
  call void @ext_func(ptr %r)
  ; %r2 is at SP+0, no add needed
- ; CHECK: local.get $push[[L4:.+]]=, [[SP]]
- ; CHECK-NEXT: call ext_func, $pop[[L4]]
  call void @ext_func(ptr %r2)
  ; Use as a value, but in a store
  ; %buf is at SP+16
- ; CHECK: local.get $push[[L5:.+]]=, [[SP]]
- ; CHECK: i[[PTR]].const $push[[OFF:.+]]=, 16
- ; CHECK-NEXT: i[[PTR]].add $push[[VAL:.+]]=, $pop[[L5]], $pop[[OFF]]
- ; CHECK-NEXT: i[[PTR]].store 0($pop{{.+}}), $pop[[VAL]]
  store ptr %buf, ptr %addr
  ret void
 }
 
-; CHECK-LABEL: allocarray_inbounds:
-; CHECK: .local i[[PTR]]{{$}}
 define void @allocarray_inbounds() {
- ; CHECK: global.get $push[[L3:.+]]=, __stack_pointer{{$}}
- ; CHECK-NEXT: i[[PTR]].const $push[[L4:.+]]=, 32{{$}}
- ; CHECK-NEXT: i[[PTR]].sub $push[[L11:.+]]=, $pop[[L3]], $pop[[L4]]
- ; CHECK-NEXT: local.tee $push[[L10:.+]]=, [[SP:.+]], $pop[[L11]]
- ; CHECK-NEXT: global.set __stack_pointer, $pop[[L10]]{{$}}
+; CHECK-32-LABEL: allocarray_inbounds:
+; CHECK-32:         .functype allocarray_inbounds () -> ()
+; CHECK-32-NEXT:    .local i32
+; CHECK-32-NEXT:  # %bb.0:
+; CHECK-32-NEXT:    global.get $push2=, __stack_pointer
+; CHECK-32-NEXT:    i32.const $push3=, 32
+; CHECK-32-NEXT:    i32.sub $push8=, $pop2, $pop3
+; CHECK-32-NEXT:    local.tee $push7=, 0, $pop8
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-32-NEXT:    local.get $push9=, 0
+; CHECK-32-NEXT:    i32.const $push0=, 1
+; CHECK-32-NEXT:    i32.store 24($pop9), $pop0
+; CHECK-32-NEXT:    local.get $push10=, 0
+; CHECK-32-NEXT:    i32.const $push6=, 1
+; CHECK-32-NEXT:    i32.store 12($pop10), $pop6
+; CHECK-32-NEXT:    i32.const $push1=, 0
+; CHECK-32-NEXT:    call ext_func, $pop1
+; CHECK-32-NEXT:    local.get $push11=, 0
+; CHECK-32-NEXT:    i32.const $push4=, 32
+; CHECK-32-NEXT:    i32.add $push5=, $pop11, $pop4
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop5
+; CHECK-32-NEXT:    return
+;
+; CHECK-64-LABEL: allocarray_inbounds:
+; CHECK-64:         .functype allocarray_inbounds () -> ()
+; CHECK-64-NEXT:    .local i64
+; CHECK-64-NEXT:  # %bb.0:
+; CHECK-64-NEXT:    global.get $push2=, __stack_pointer
+; CHECK-64-NEXT:    i64.const $push3=, 32
+; CHECK-64-NEXT:    i64.sub $push8=, $pop2, $pop3
+; CHECK-64-NEXT:    local.tee $push7=, 0, $pop8
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-64-NEXT:    local.get $push9=, 0
+; CHECK-64-NEXT:    i32.const $push0=, 1
+; CHECK-64-NEXT:    i32.store 24($pop9), $pop0
+; CHECK-64-NEXT:    local.get $push10=, 0
+; CHECK-64-NEXT:    i32.const $push6=, 1
+; CHECK-64-NEXT:    i32.store 12($pop10), $pop6
+; CHECK-64-NEXT:    i64.const $push1=, 0
+; CHECK-64-NEXT:    call ext_func, $pop1
+; CHECK-64-NEXT:    local.get $push11=, 0
+; CHECK-64-NEXT:    i64.const $push4=, 32
+; CHECK-64-NEXT:    i64.add $push5=, $pop11, $pop4
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop5
+; CHECK-64-NEXT:    return
  %r = alloca [5 x i32]
- ; CHECK: i32.const $push[[L3:.+]]=, 1
- ; CHECK-DAG: i32.store 24(${{.+}}), $pop[[L3]]
  store i32 1, ptr %r
  ; This store should have both the GEP and the FI folded into it.
- ; CHECK-DAG: i32.store 12(${{.+}}), $pop
  %p2 = getelementptr inbounds [5 x i32], ptr %r, i32 0, i32 3
  store i32 1, ptr %p2
  call void @ext_func(ptr null);
- ; CHECK: call ext_func
- ; CHECK: i[[PTR]].const $push[[L5:.+]]=, 32{{$}}
- ; CHECK-NEXT: i[[PTR]].add $push[[L7:.+]]=, ${{.+}}, $pop[[L5]]
- ; CHECK-NEXT: global.set __stack_pointer, $pop[[L7]]
  ret void
 }
 
-; CHECK-LABEL: dynamic_alloca:
 define void @dynamic_alloca(i32 %alloc) {
- ; CHECK: global.get $push[[L13:.+]]=, __stack_pointer{{$}}
- ; CHECK-NEXT: local.tee $push[[L12:.+]]=, [[SP:.+]], $pop[[L13]]{{$}}
  ; Target independent codegen bumps the stack pointer.
- ; CHECK: i[[PTR]].sub
  ; Check that SP is written back to memory after decrement
- ; CHECK: global.set __stack_pointer,
+; CHECK-32-LABEL: dynamic_alloca:
+; CHECK-32:         .functype dynamic_alloca (i32) -> ()
+; CHECK-32-NEXT:    .local i32
+; CHECK-32-NEXT:  # %bb.0:
+; CHECK-32-NEXT:    global.get $push10=, __stack_pointer
+; CHECK-32-NEXT:    local.tee $push9=, 1, $pop10
+; CHECK-32-NEXT:    local.get $push11=, 0
+; CHECK-32-NEXT:    i32.const $push0=, 2
+; CHECK-32-NEXT:    i32.shl $push1=, $pop11, $pop0
+; CHECK-32-NEXT:    i32.const $push2=, 15
+; CHECK-32-NEXT:    i32.add $push3=, $pop1, $pop2
+; CHECK-32-NEXT:    i32.const $push4=, -16
+; CHECK-32-NEXT:    i32.and $push5=, $pop3, $pop4
+; CHECK-32-NEXT:    i32.sub $push8=, $pop9, $pop5
+; CHECK-32-NEXT:    local.tee $push7=, 0, $pop8
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-32-NEXT:    local.get $push12=, 0
+; CHECK-32-NEXT:    call ext_func_i32, $pop12
+; CHECK-32-NEXT:    local.get $push6=, 1
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-32-NEXT:    return
+;
+; CHECK-64-LABEL: dynamic_alloca:
+; CHECK-64:         .functype dynamic_alloca (i32) -> ()
+; CHECK-64-NEXT:    .local i64, i64
+; CHECK-64-NEXT:  # %bb.0:
+; CHECK-64-NEXT:    global.get $push11=, __stack_pointer
+; CHECK-64-NEXT:    local.tee $push10=, 1, $pop11
+; CHECK-64-NEXT:    local.get $push12=, 0
+; CHECK-64-NEXT:    i64.extend_i32_u $push0=, $pop12
+; CHECK-64-NEXT:    i64.const $push1=, 2
+; CHECK-64-NEXT:    i64.shl $push2=, $pop0, $pop1
+; CHECK-64-NEXT:    i64.const $push3=, 15
+; CHECK-64-NEXT:    i64.add $push4=, $pop2, $pop3
+; CHECK-64-NEXT:    i64.const $push5=, 34359738352
+; CHECK-64-NEXT:    i64.and $push6=, $pop4, $pop5
+; CHECK-64-NEXT:    i64.sub $push9=, $pop10, $pop6
+; CHECK-64-NEXT:    local.tee $push8=, 2, $pop9
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-64-NEXT:    local.get $push13=, 2
+; CHECK-64-NEXT:    call ext_func_i32, $pop13
+; CHECK-64-NEXT:    local.get $push7=, 1
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-64-NEXT:    return
  %r = alloca i32, i32 %alloc
  ; Target-independent codegen also calculates the store addr
- ; CHECK: call ext_func_i32
  call void @ext_func_i32(ptr %r)
- ; CHECK: global.set __stack_pointer, $pop{{.+}}
  ret void
 }
 
-; CHECK-LABEL: dynamic_alloca_redzone:
 define void @dynamic_alloca_redzone(i32 %alloc) {
- ; CHECK: global.get $push[[L13:.+]]=, __stack_pointer{{$}}
- ; CHECK-NEXT: local.tee $push[[L12:.+]]=, [[SP:.+]], $pop[[L13]]{{$}}
  ; Target independent codegen bumps the stack pointer
- ; CHECK: i[[PTR]].sub
+; CHECK-32-LABEL: dynamic_alloca_redzone:
+; CHECK-32:         .functype dynamic_alloca_redzone (i32) -> ()
+; CHECK-32-NEXT:    .local i32
+; CHECK-32-NEXT:  # %bb.0:
+; CHECK-32-NEXT:    global.get $push8=, __stack_pointer
+; CHECK-32-NEXT:    local.tee $push9=, 1, $pop8
+; CHECK-32-NEXT:    drop $pop9
+; CHECK-32-NEXT:    local.get $push11=, 1
+; CHECK-32-NEXT:    local.get $push10=, 0
+; CHECK-32-NEXT:    i32.const $push0=, 2
+; CHECK-32-NEXT:    i32.shl $push1=, $pop10, $pop0
+; CHECK-32-NEXT:    i32.const $push2=, 15
+; CHECK-32-NEXT:    i32.add $push3=, $pop1, $pop2
+; CHECK-32-NEXT:    i32.const $push4=, -16
+; CHECK-32-NEXT:    i32.and $push5=, $pop3, $pop4
+; CHECK-32-NEXT:    i32.sub $push7=, $pop11, $pop5
+; CHECK-32-NEXT:    local.tee $push12=, 0, $pop7
+; CHECK-32-NEXT:    drop $pop12
+; CHECK-32-NEXT:    local.get $push13=, 0
+; CHECK-32-NEXT:    i32.const $push6=, 0
+; CHECK-32-NEXT:    i32.store 0($pop13), $pop6
+; CHECK-32-NEXT:    return
+;
+; CHECK-64-LABEL: dynamic_alloca_redzone:
+; CHECK-64:         .functype dynamic_alloca_redzone (i32) -> ()
+; CHECK-64-NEXT:    .local i64
+; CHECK-64-NEXT:  # %bb.0:
+; CHECK-64-NEXT:    global.get $push9=, __stack_pointer
+; CHECK-64-NEXT:    local.tee $push10=, 1, $pop9
+; CHECK-64-NEXT:    drop $pop10
+; CHECK-64-NEXT:    local.get $push12=, 1
+; CHECK-64-NEXT:    local.get $push11=, 0
+; CHECK-64-NEXT:    i64.extend_i32_u $push0=, $pop11
+; CHECK-64-NEXT:    i64.const $push1=, 2
+; CHECK-64-NEXT:    i64.shl $push2=, $pop0, $pop1
+; CHECK-64-NEXT:    i64.const $push3=, 15
+; CHECK-64-NEXT:    i64.add $push4=, $pop2, $pop3
+; CHECK-64-NEXT:    i64.const $push5=, 34359738352
+; CHECK-64-NEXT:    i64.and $push6=, $pop4, $pop5
+; CHECK-64-NEXT:    i64.sub $push8=, $pop12, $pop6
+; CHECK-64-NEXT:    local.tee $push13=, 1, $pop8
+; CHECK-64-NEXT:    drop $pop13
+; CHECK-64-NEXT:    local.get $push14=, 1
+; CHECK-64-NEXT:    i32.const $push7=, 0
+; CHECK-64-NEXT:    i32.store 0($pop14), $pop7
+; CHECK-64-NEXT:    return
  %r = alloca i32, i32 %alloc
- ; CHECK-NEXT: local.tee $push[[L8:.+]]=, [[SP2:.+]], $pop
- ; CHECK: local.get $push[[L7:.+]]=, [[SP2]]{{$}}
- ; CHECK-NEXT: i32.const $push[[L6:.+]]=, 0{{$}}
- ; CHECK-NEXT: i32.store 0($pop[[L7]]), $pop[[L6]]{{$}}
  store i32 0, ptr %r
- ; CHECK-NEXT: return
  ret void
 }
 
-; CHECK-LABEL: dynamic_static_alloca:
 define void @dynamic_static_alloca(i32 %alloc) noredzone {
  ; Decrement SP in the prolog by the static amount and writeback to memory.
- ; CHECK: global.get $push[[L11:.+]]=, __stack_pointer{{$}}
- ; CHECK-NEXT: i[[PTR]].const $push[[L12:.+]]=, 16
- ; CHECK-NEXT: i[[PTR]].sub $push[[L23:.+]]=, $pop[[L11]], $pop[[L12]]
- ; CHECK-NEXT: local.tee $push[[L22:.+]]=, [[SP:.+]], $pop[[L23]]
- ; CHECK-NEXT: global.set __stack_pointer, $pop[[L22]]
-
  ; Alloc and write to a static alloca
- ; CHECK: local.get $push[[L21:.+]]=, [[SP:.+]]
- ; CHECK-NEXT: local.tee $push[[pushedFP:.+]]=, [[FP:.+]], $pop[[L21]]
- ; CHECK-NEXT: i32.const $push[[L0:.+]]=, 101
- ; CHECK-NEXT: i32.store [[static_offset:.+]]($pop[[pushedFP]]), $pop[[L0]]
+; CHECK-32-LABEL: dynamic_static_alloca:
+; CHECK-32:         .functype dynamic_static_alloca (i32) -> ()
+; CHECK-32-NEXT:    .local i32, i32, i32
+; CHECK-32-NEXT:  # %bb.0:
+; CHECK-32-NEXT:    global.get $push11=, __stack_pointer
+; CHECK-32-NEXT:    i32.const $push12=, 16
+; CHECK-32-NEXT:    i32.sub $push25=, $pop11, $pop12
+; CHECK-32-NEXT:    local.tee $push24=, 1, $pop25
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop24
+; CHECK-32-NEXT:    local.get $push23=, 1
+; CHECK-32-NEXT:    local.tee $push22=, 2, $pop23
+; CHECK-32-NEXT:    i32.const $push0=, 101
+; CHECK-32-NEXT:    i32.store 12($pop22), $pop0
+; CHECK-32-NEXT:    local.get $push27=, 1
+; CHECK-32-NEXT:    local.get $push26=, 0
+; CHECK-32-NEXT:    i32.const $push1=, 2
+; CHECK-32-NEXT:    i32.shl $push2=, $pop26, $pop1
+; CHECK-32-NEXT:    i32.const $push3=, 15
+; CHECK-32-NEXT:    i32.add $push4=, $pop2, $pop3
+; CHECK-32-NEXT:    i32.const $push5=, -16
+; CHECK-32-NEXT:    i32.and $push21=, $pop4, $pop5
+; CHECK-32-NEXT:    local.tee $push20=, 0, $pop21
+; CHECK-32-NEXT:    i32.sub $push19=, $pop27, $pop20
+; CHECK-32-NEXT:    local.tee $push18=, 1, $pop19
+; CHECK-32-NEXT:    local.tee $push17=, 3, $pop18
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop17
+; CHECK-32-NEXT:    local.get $push28=, 2
+; CHECK-32-NEXT:    i32.const $push6=, 102
+; CHECK-32-NEXT:    i32.store 12($pop28), $pop6
+; CHECK-32-NEXT:    local.get $push29=, 1
+; CHECK-32-NEXT:    i32.const $push7=, 103
+; CHECK-32-NEXT:    i32.store 0($pop29), $pop7
+; CHECK-32-NEXT:    local.get $push31=, 3
+; CHECK-32-NEXT:    local.get $push30=, 0
+; CHECK-32-NEXT:    i32.sub $push16=, $pop31, $pop30
+; CHECK-32-NEXT:    local.tee $push15=, 0, $pop16
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop15
+; CHECK-32-NEXT:    local.get $push32=, 2
+; CHECK-32-NEXT:    i32.const $push8=, 104
+; CHECK-32-NEXT:    i32.store 12($pop32), $pop8
+; CHECK-32-NEXT:    local.get $push33=, 1
+; CHECK-32-NEXT:    i32.const $push9=, 105
+; CHECK-32-NEXT:    i32.store 0($pop33), $pop9
+; CHECK-32-NEXT:    local.get $push34=, 0
+; CHECK-32-NEXT:    i32.const $push10=, 106
+; CHECK-32-NEXT:    i32.store 0($pop34), $pop10
+; CHECK-32-NEXT:    local.get $push35=, 2
+; CHECK-32-NEXT:    i32.const $push13=, 16
+; CHECK-32-NEXT:    i32.add $push14=, $pop35, $pop13
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop14
+; CHECK-32-NEXT:    return
+;
+; CHECK-64-LABEL: dynamic_static_alloca:
+; CHECK-64:         .functype dynamic_static_alloca (i32) -> ()
+; CHECK-64-NEXT:    .local i64, i64, i64, i64
+; CHECK-64-NEXT:  # %bb.0:
+; CHECK-64-NEXT:    global.get $push12=, __stack_pointer
+; CHECK-64-NEXT:    i64.const $push13=, 16
+; CHECK-64-NEXT:    i64.sub $push26=, $pop12, $pop13
+; CHECK-64-NEXT:    local.tee $push25=, 1, $pop26
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop25
+; CHECK-64-NEXT:    local.get $push24=, 1
+; CHECK-64-NEXT:    local.tee $push23=, 2, $pop24
+; CHECK-64-NEXT:    i32.const $push0=, 101
+; CHECK-64-NEXT:    i32.store 12($pop23), $pop0
+; CHECK-64-NEXT:    local.get $push28=, 1
+; CHECK-64-NEXT:    local.get $push27=, 0
+; CHECK-64-NEXT:    i64.extend_i32_u $push1=, $pop27
+; CHECK-64-NEXT:    i64.const $push2=, 2
+; CHECK-64-NEXT:    i64.shl $push3=, $pop1, $pop2
+; CHECK-64-NEXT:    i64.const $push4=, 15
+; CHECK-64-NEXT:    i64.add $push5=, $pop3, $pop4
+; CHECK-64-NEXT:    i64.const $push6=, 34359738352
+; CHECK-64-NEXT:    i64.and $push22=, $pop5, $pop6
+; CHECK-64-NEXT:    local.tee $push21=, 3, $pop22
+; CHECK-64-NEXT:    i64.sub $push20=, $pop28, $pop21
+; CHECK-64-NEXT:    local.tee $push19=, 1, $pop20
+; CHECK-64-NEXT:    local.tee $push18=, 4, $pop19
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop18
+; CHECK-64-NEXT:    local.get $push29=, 2
+; CHECK-64-NEXT:    i32.const $push7=, 102
+; CHECK-64-NEXT:    i32.store 12($pop29), $pop7
+; CHECK-64-NEXT:    local.get $push30=, 1
+; CHECK-64-NEXT:    i32.const $push8=, 103
+; CHECK-64-NEXT:    i32.store 0($pop30), $pop8
+; CHECK-64-NEXT:    local.get $push32=, 4
+; CHECK-64-NEXT:    local.get $push31=, 3
+; CHECK-64-NEXT:    i64.sub $push17=, $pop32, $pop31
+; CHECK-64-NEXT:    local.tee $push16=, 3, $pop17
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop16
+; CHECK-64-NEXT:    local.get $push33=, 2
+; CHECK-64-NEXT:    i32.const $push9=, 104
+; CHECK-64-NEXT:    i32.store 12($pop33), $pop9
+; CHECK-64-NEXT:    local.get $push34=, 1
+; CHECK-64-NEXT:    i32.const $push10=, 105
+; CHECK-64-NEXT:    i32.store 0($pop34), $pop10
+; CHECK-64-NEXT:    local.get $push35=, 3
+; CHECK-64-NEXT:    i32.const $push11=, 106
+; CHECK-64-NEXT:    i32.store 0($pop35), $pop11
+; CHECK-64-NEXT:    local.get $push36=, 2
+; CHECK-64-NEXT:    i64.const $push14=, 16
+; CHECK-64-NEXT:    i64.add $push15=, $pop36, $pop14
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop15
+; CHECK-64-NEXT:    return
  %static = alloca i32
  store volatile i32 101, ptr %static
-
  ; Decrement SP in the body by the dynamic amount.
- ; CHECK: i[[PTR]].sub
- ; CHECK: local.tee $push[[L16:.+]]=, [[dynamic_local:.+]], $pop{{.+}}
- ; CHECK: local.tee $push[[L15:.+]]=, [[other:.+]], $pop[[L16]]{{$}}
- ; CHECK: global.set __stack_pointer, $pop[[L15]]{{$}}
  %dynamic = alloca i32, i32 %alloc
-
  ; Ensure we don't modify the frame pointer after assigning it.
- ; CHECK-NOT: $[[FP]]=
-
  ; Ensure the static address doesn't change after modifying the stack pointer.
- ; CHECK: local.get $push[[L17:.+]]=, [[FP]]
- ; CHECK: i32.const $push[[L7:.+]]=, 102
- ; CHECK-NEXT: i32.store [[static_offset]]($pop[[L17]]), $pop[[L7]]
- ; CHECK-NEXT: local.get $push[[L9:.+]]=, [[dynamic_local]]{{$}}
- ; CHECK-NEXT: i32.const $push[[L8:.+]]=, 103
- ; CHECK-NEXT: i32.store 0($pop[[L9]]), $pop[[L8]]
  store volatile i32 102, ptr %static
  store volatile i32 103, ptr %dynamic
-
  ; Decrement SP in the body by the dynamic amount.
- ; CHECK: i[[PTR]].sub
- ; CHECK: local.tee $push{{.+}}=, [[dynamic2_local:.+]], $pop{{.+}}
  %dynamic.2 = alloca i32, i32 %alloc
-
- ; CHECK-NOT: $[[FP]]=
-
  ; Ensure neither the static nor dynamic address changes after the second
  ; modification of the stack pointer.
- ; CHECK: local.get $push[[L22:.+]]=, [[FP]]
- ; CHECK: i32.const $push[[L9:.+]]=, 104
- ; CHECK-NEXT: i32.store [[static_offset]]($pop[[L22]]), $pop[[L9]]
- ; CHECK-NEXT: local.get $push[[L23:.+]]=, [[dynamic_local]]
- ; CHECK-NEXT: i32.const $push[[L10:.+]]=, 105
- ; CHECK-NEXT: i32.store 0($pop[[L23]]), $pop[[L10]]
- ; CHECK-NEXT: local.get $push[[L23:.+]]=, [[dynamic2_local]]
- ; CHECK-NEXT: i32.const $push[[L11:.+]]=, 106
- ; CHECK-NEXT: i32.store 0($pop[[L23]]), $pop[[L11]]
  store volatile i32 104, ptr %static
  store volatile i32 105, ptr %dynamic
  store volatile i32 106, ptr %dynamic.2
-
  ; Writeback to memory.
- ; CHECK: local.get $push[[L24:.+]]=, [[FP]]{{$}}
- ; CHECK: i[[PTR]].const $push[[L18:.+]]=, 16
- ; CHECK-NEXT: i[[PTR]].add $push[[L19:.+]]=, $pop[[L24]], $pop[[L18]]
- ; CHECK-NEXT: global.set __stack_pointer, $pop[[L19]]
  ret void
 }
 
 declare ptr @llvm.stacksave()
 declare void @llvm.stackrestore(ptr)
 
-; CHECK-LABEL: llvm_stack_builtins:
 define void @llvm_stack_builtins(i32 %alloc) noredzone {
- ; CHECK: global.get $push[[L11:.+]]=, __stack_pointer{{$}}
- ; CHECK-NEXT: local.tee $push[[L10:.+]]=, {{.+}}, $pop[[L11]]
- ; CHECK-NEXT: local.set [[STACK:.+]], $pop[[L10]]
+; CHECK-32-LABEL: llvm_stack_builtins:
+; CHECK-32:         .functype llvm_stack_builtins (i32) -> ()
+; CHECK-32-NEXT:    .local i32, i32, i32
+; CHECK-32-NEXT:  # %bb.0:
+; CHECK-32-NEXT:    global.get $push7=, __stack_pointer
+; CHECK-32-NEXT:    local.tee $push8=, 1, $pop7
+; CHECK-32-NEXT:    local.set 2, $pop8
+; CHECK-32-NEXT:    local.get $push9=, 1
+; CHECK-32-NEXT:    local.set 3, $pop9
+; CHECK-32-NEXT:    local.get $push11=, 1
+; CHECK-32-NEXT:    local.get $push10=, 0
+; CHECK-32-NEXT:    i32.const $push0=, 2
+; CHECK-32-NEXT:    i32.shl $push1=, $pop10, $pop0
+; CHECK-32-NEXT:    i32.const $push2=, 15
+; CHECK-32-NEXT:    i32.add $push3=, $pop1, $pop2
+; CHECK-32-NEXT:    i32.const $push4=, -16
+; CHECK-32-NEXT:    i32.and $push5=, $pop3, $pop4
+; CHECK-32-NEXT:    i32.sub $push6=, $pop11, $pop5
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-32-NEXT:    local.get $push12=, 3
+; CHECK-32-NEXT:    drop $pop12
+; CHECK-32-NEXT:    local.get $push13=, 2
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop13
+; CHECK-32-NEXT:    return
+;
+; CHECK-64-LABEL: llvm_stack_builtins:
+; CHECK-64:         .functype llvm_stack_builtins (i32) -> ()
+; CHECK-64-NEXT:    .local i64, i64, i64
+; CHECK-64-NEXT:  # %bb.0:
+; CHECK-64-NEXT:    global.get $push8=, __stack_pointer
+; CHECK-64-NEXT:    local.tee $push9=, 1, $pop8
+; CHECK-64-NEXT:    local.set 2, $pop9
+; CHECK-64-NEXT:    local.get $push10=, 1
+; CHECK-64-NEXT:    local.set 3, $pop10
+; CHECK-64-NEXT:    local.get $push12=, 1
+; CHECK-64-NEXT:    local.get $push11=, 0
+; CHECK-64-NEXT:    i64.extend_i32_u $push0=, $pop11
+; CHECK-64-NEXT:    i64.const $push1=, 2
+; CHECK-64-NEXT:    i64.shl $push2=, $pop0, $pop1
+; CHECK-64-NEXT:    i64.const $push3=, 15
+; CHECK-64-NEXT:    i64.add $push4=, $pop2, $pop3
+; CHECK-64-NEXT:    i64.const $push5=, 34359738352
+; CHECK-64-NEXT:    i64.and $push6=, $pop4, $pop5
+; CHECK-64-NEXT:    i64.sub $push7=, $pop12, $pop6
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-64-NEXT:    local.get $push13=, 3
+; CHECK-64-NEXT:    drop $pop13
+; CHECK-64-NEXT:    local.get $push14=, 2
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop14
+; CHECK-64-NEXT:    return
  %stack = call ptr @llvm.stacksave()
-
  ; Ensure we don't reassign the stacksave local
- ; CHECK-NOT: local.set [[STACK]],
  %dynamic = alloca i32, i32 %alloc
-
- ; CHECK: local.get $push[[L12:.+]]=, [[STACK]]
- ; CHECK-NEXT: global.set __stack_pointer, $pop[[L12]]
  call void @llvm.stackrestore(ptr %stack)
-
  ret void
 }
 
 ; Not actually using the alloca'd variables exposed an issue with register
 ; stackification, where copying the stack pointer into the frame pointer was
 ; moved after the stack pointer was updated for the dynamic alloca.
-; CHECK-LABEL: dynamic_alloca_nouse:
 define void @dynamic_alloca_nouse(i32 %alloc) noredzone {
- ; CHECK: global.get $push[[L11:.+]]=, __stack_pointer{{$}}
- ; CHECK-NEXT: local.tee $push[[L10:.+]]=, {{.+}}, $pop[[L11]]
- ; CHECK-NEXT: local.set [[FP:.+]], $pop[[L10]]
+; CHECK-32-LABEL: dynamic_alloca_nouse:
+; CHECK-32:         .functype dynamic_alloca_nouse (i32) -> ()
+; CHECK-32-NEXT:    .local i32, i32
+; CHECK-32-NEXT:  # %bb.0:
+; CHECK-32-NEXT:    global.get $push7=, __stack_pointer
+; CHECK-32-NEXT:    local.tee $push8=, 1, $pop7
+; CHECK-32-NEXT:    local.set 2, $pop8
+; CHECK-32-NEXT:    local.get $push10=, 1
+; CHECK-32-NEXT:    local.get $push9=, 0
+; CHECK-32-NEXT:    i32.const $push0=, 2
+; CHECK-32-NEXT:    i32.shl $push1=, $pop9, $pop0
+; CHECK-32-NEXT:    i32.const $push2=, 15
+; CHECK-32-NEXT:    i32.add $push3=, $pop1, $pop2
+; CHECK-32-NEXT:    i32.const $push4=, -16
+; CHECK-32-NEXT:    i32.and $push5=, $pop3, $pop4
+; CHECK-32-NEXT:    i32.sub $push6=, $pop10, $pop5
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-32-NEXT:    local.get $push11=, 2
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop11
+; CHECK-32-NEXT:    return
+;
+; CHECK-64-LABEL: dynamic_alloca_nouse:
+; CHECK-64:         .functype dynamic_alloca_nouse (i32) -> ()
+; CHECK-64-NEXT:    .local i64, i64
+; CHECK-64-NEXT:  # %bb.0:
+; CHECK-64-NEXT:    global.get $push8=, __stack_pointer
+; CHECK-64-NEXT:    local.tee $push9=, 1, $pop8
+; CHECK-64-NEXT:    local.set 2, $pop9
+; CHECK-64-NEXT:    local.get $push11=, 1
+; CHECK-64-NEXT:    local.get $push10=, 0
+; CHECK-64-NEXT:    i64.extend_i32_u $push0=, $pop10
+; CHECK-64-NEXT:    i64.const $push1=, 2
+; CHECK-64-NEXT:    i64.shl $push2=, $pop0, $pop1
+; CHECK-64-NEXT:    i64.const $push3=, 15
+; CHECK-64-NEXT:    i64.add $push4=, $pop2, $pop3
+; CHECK-64-NEXT:    i64.const $push5=, 34359738352
+; CHECK-64-NEXT:    i64.and $push6=, $pop4, $pop5
+; CHECK-64-NEXT:    i64.sub $push7=, $pop11, $pop6
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-64-NEXT:    local.get $push12=, 2
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop12
+; CHECK-64-NEXT:    return
  %dynamic = alloca i32, i32 %alloc
-
- ; CHECK-NOT: local.set [[FP]],
-
- ; CHECK: local.get $push[[L12:.+]]=, [[FP]]
- ; CHECK-NEXT: global.set __stack_pointer, $pop[[L12]]
  ret void
 }
 
 ; The use of the alloca in a phi causes a CopyToReg DAG node to be generated,
 ; which has to have special handling because CopyToReg can't have a FI operand
-; CHECK-LABEL: copytoreg_fi:
 define void @copytoreg_fi(i1 %cond, ptr %b) {
+; CHECK-32-LABEL: copytoreg_fi:
+; CHECK-32:         .functype copytoreg_fi (i32, i32) -> ()
+; CHECK-32-NEXT:    .local i32
+; CHECK-32-NEXT:  # %bb.0: # %entry
+; CHECK-32-NEXT:    global.get $push0=, __stack_pointer
+; CHECK-32-NEXT:    i32.const $push1=, 16
+; CHECK-32-NEXT:    i32.sub $push3=, $pop0, $pop1
+; CHECK-32-NEXT:    i32.const $push2=, 12
+; CHECK-32-NEXT:    i32.add $push6=, $pop3, $pop2
+; CHECK-32-NEXT:    local.set 2, $pop6
+; CHECK-32-NEXT:    local.get $push8=, 0
+; CHECK-32-NEXT:    i32.const $push4=, 1
+; CHECK-32-NEXT:    i32.and $push7=, $pop8, $pop4
+; CHECK-32-NEXT:    local.set 0, $pop7
+; CHECK-32-NEXT:  .LBB10_1: # %body
+; CHECK-32-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-32-NEXT:    loop # label0:
+; CHECK-32-NEXT:    local.get $push9=, 2
+; CHECK-32-NEXT:    i32.const $push5=, 1
+; CHECK-32-NEXT:    i32.store 0($pop9), $pop5
+; CHECK-32-NEXT:    local.get $push10=, 1
+; CHECK-32-NEXT:    local.set 2, $pop10
+; CHECK-32-NEXT:    local.get $push11=, 0
+; CHECK-32-NEXT:    br_if 0, $pop11 # 0: up to label0
+; CHECK-32-NEXT:  # %bb.2: # %exit
+; CHECK-32-NEXT:    end_loop
+; CHECK-32-NEXT:    return
+;
+; CHECK-64-LABEL: copytoreg_fi:
+; CHECK-64:         .functype copytoreg_fi (i32, i64) -> ()
+; CHECK-64-NEXT:    .local i64
+; CHECK-64-NEXT:  # %bb.0: # %entry
+; CHECK-64-NEXT:    global.get $push0=, __stack_pointer
+; CHECK-64-NEXT:    i64.const $push1=, 16
+; CHECK-64-NEXT:    i64.sub $push3=, $pop0, $pop1
+; CHECK-64-NEXT:    i64.const $push2=, 12
+; CHECK-64-NEXT:    i64.add $push6=, $pop3, $pop2
+; CHECK-64-NEXT:    local.set 2, $pop6
+; CHECK-64-NEXT:    local.get $push8=, 0
+; CHECK-64-NEXT:    i32.const $push4=, 1
+; CHECK-64-NEXT:    i32.and $push7=, $pop8, $pop4
+; CHECK-64-NEXT:    local.set 0, $pop7
+; CHECK-64-NEXT:  .LBB10_1: # %body
+; CHECK-64-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-64-NEXT:    loop # label0:
+; CHECK-64-NEXT:    local.get $push9=, 2
+; CHECK-64-NEXT:    i32.const $push5=, 1
+; CHECK-64-NEXT:    i32.store 0($pop9), $pop5
+; CHECK-64-NEXT:    local.get $push10=, 1
+; CHECK-64-NEXT:    local.set 2, $pop10
+; CHECK-64-NEXT:    local.get $push11=, 0
+; CHECK-64-NEXT:    br_if 0, $pop11 # 0: up to label0
+; CHECK-64-NEXT:  # %bb.2: # %exit
+; CHECK-64-NEXT:    end_loop
+; CHECK-64-NEXT:    return
 entry:
- ; CHECK: i[[PTR]].const $push[[L1:.+]]=, 16
- ; CHECK-NEXT: i[[PTR]].sub $push[[L3:.+]]=, {{.+}}, $pop[[L1]]
  %addr = alloca i32
- ; CHECK: i[[PTR]].const $push[[OFF:.+]]=, 12
- ; CHECK-NEXT: i[[PTR]].add $push[[ADDR:.+]]=, $pop[[L3]], $pop[[OFF]]
- ; CHECK-NEXT: local.set [[COPY:.+]], $pop[[ADDR]]
  br label %body
 body:
  %a = phi ptr [%addr, %entry], [%b, %body]
  store i32 1, ptr %a
- ; CHECK: local.get $push[[L12:.+]]=, [[COPY]]
- ; CHECK: i32.store 0($pop[[L12]]),
  br i1 %cond, label %body, label %exit
 exit:
  ret void
@@ -291,37 +673,84 @@ declare void @use_i8_star(ptr)
 declare ptr @llvm.frameaddress(i32)
 
 ; Test __builtin_frame_address(0).
-; CHECK-LABEL: frameaddress_0:
-; CHECK: global.get $push[[L3:.+]]=, __stack_pointer{{$}}
-; CHECK-NEXT: local.tee $push[[L2:.+]]=, [[FP:.+]], $pop[[L3]]{{$}}
-; CHECK-NEXT: call use_i8_star, $pop[[L2]]
-; CHECK-NEXT: local.get $push[[L5:.+]]=, [[FP]]
-; CHECK-NEXT: global.set __stack_pointer, $pop[[L5]]
 define void @frameaddress_0() {
+; CHECK-32-LABEL: frameaddress_0:
+; CHECK-32:         .functype frameaddress_0 () -> ()
+; CHECK-32-NEXT:    .local i32
+; CHECK-32-NEXT:  # %bb.0:
+; CHECK-32-NEXT:    global.get $push1=, __stack_pointer
+; CHECK-32-NEXT:    local.tee $push0=, 0, $pop1
+; CHECK-32-NEXT:    call use_i8_star, $pop0
+; CHECK-32-NEXT:    local.get $push2=, 0
+; CHECK-32-NEXT:    global.set __stack_pointer, $pop2
+; CHECK-32-NEXT:    return
+;
+; CHECK-64-LABEL: frameaddress_0:
+; CHECK-64:         .functype frameaddress_0 () -> ()
+; CHECK-64-NEXT:    .local i64
+; CHECK-64-NEXT:  # %bb.0:
+; CHECK-64-NEXT:    global.get $push1=, __stack_pointer
+; CHECK-64-NEXT:    local.tee $push0=, 0, $pop1
+; CHECK-64-NEXT:    call use_i8_star, $pop0
+; CHECK-64-NEXT:    local.get $push2=, 0
+; CHECK-64-NEXT:    global.set __stack_pointer, $pop2
+; CHECK-64-NEXT:    return
   %t = call ptr @llvm.frameaddress(i32 0)
   call void @use_i8_star(ptr %t)
   ret void
 }
 
 ; Test __builtin_frame_address(1).
-
-; CHECK-LABEL: frameaddress_1:
-; CHECK:      i[[PTR]].const $push0=, 0{{$}}
-; CHECK-NEXT: call use_i8_star, $pop0{{$}}
-; CHECK-NEXT: return{{$}}
 define void @frameaddress_1() {
+; CHECK-32-LABEL: frameaddress_1:
+; CHECK-32:         .functype frameaddress_1 () -> ()
+; CHECK-32-NEXT:  # %bb.0:
+; CHECK-32-NEXT:    i32.const $push0=, 0
+; CHECK-32-NEXT:    call use_i8_star, $pop0
+; CHECK-32-NEXT:    return
+;
+; CHECK-64-LABEL: frameaddress_1:
+; CHECK-64:         .functype frameaddress_1 () -> ()
+; CHECK-64-NEXT:  # %bb.0:
+; CHECK-64-NEXT:    i64.const $push0=, 0
+; CHECK-64-NEXT:    call use_i8_star, $pop0
+; CHECK-64-NEXT:    return
   %t = call ptr @llvm.frameaddress(i32 1)
   call void @use_i8_star(ptr %t)
   ret void
 }
 
 ; Test a stack address passed to an inline asm.
-; CHECK-LABEL: inline_asm:
-; CHECK:       global.get {{.+}}, __stack_pointer{{$}}
-; CHECK:       #APP
-; CHECK-NEXT:  # %{{[0-9]+}}{{$}}
-; CHECK-NEXT:  #NO_APP
 define void @inline_asm() {
+; CHECK-32-LABEL: inline_asm:
+; CHECK-32:         .functype inline_asm () -> ()
+; CHECK-32-NEXT:    .local i32
+; CHECK-32-NEXT:  # %bb.0:
+; CHECK-32-NEXT:    global.get $push0=, __stack_pointer
+; CHECK-32-NEXT:    i32.const $push1=, 16
+; CHECK-32-NEXT:    i32.sub $push3=, $pop0, $pop1
+; CHECK-32-NEXT:    i32.const $push2=, 15
+; CHECK-32-NEXT:    i32.add $push4=, $pop3, $pop2
+; CHECK-32-NEXT:    local.set 0, $pop4
+; CHECK-32-NEXT:    #APP
+; CHECK-32-NEXT:    # %0
+; CHECK-32-NEXT:    #NO_APP
+; CHECK-32-NEXT:    return
+;
+; CHECK-64-LABEL: inline_asm:
+; CHECK-64:         .functype inline_asm () -> ()
+; CHECK-64-NEXT:    .local i64
+; CHECK-64-NEXT:  # %bb.0:
+; CHECK-64-NEXT:    global.get $push0=, __stack_pointer
+; CHECK-64-NEXT:    i64.const $push1=, 16
+; CHECK-64-NEXT:    i64.sub $push3=, $pop0, $pop1
+; CHECK-64-NEXT:    i64.const $push2=, 15
+; CHECK-64-NEXT:    i64.add $push4=, $pop3, $pop2
+; CHECK-64-NEXT:    local.set 0, $pop4
+; CHECK-64-NEXT:    #APP
+; CHECK-64-NEXT:    # %0
+; CHECK-64-NEXT:    #NO_APP
+; CHECK-64-NEXT:    return
   %tmp = alloca i8
   call void asm sideeffect "# %0", "r"(ptr %tmp)
   ret void
@@ -330,10 +759,38 @@ define void @inline_asm() {
 ; We optimize the format of "frame offset + operand" by folding it, but this is
 ; only possible when that operand is an immediate. In this example it is a
 ; global address, so we should not fold it.
-; CHECK-LABEL: frame_offset_with_global_address
-; CHECK: i[[PTR]].const ${{.*}}=, str
 @str = local_unnamed_addr global [3 x i8] c"abc", align 16
 define i8 @frame_offset_with_global_address() {
+; CHECK-32-LABEL: frame_offset_with_global_address:
+; CHECK-32:         .functype frame_offset_with_global_address () -> (i32)
+; CHECK-32-NEXT:  # %bb.0:
+; CHECK-32-NEXT:    i32.const $push0=, str
+; CHECK-32-NEXT:    global.get $push5=, __stack_pointer
+; CHECK-32-NEXT:    i32.const $push6=, 16
+; CHECK-32-NEXT:    i32.sub $push9=, $pop5, $pop6
+; CHECK-32-NEXT:    i32.const $push7=, 12
+; CHECK-32-NEXT:    i32.add $push8=, $pop9, $pop7
+; CHECK-32-NEXT:    i32.add $push1=, $pop0, $pop8
+; CHECK-32-NEXT:    i32.load8_u $push2=, 0($pop1)
+; CHECK-32-NEXT:    i32.const $push3=, 67
+; CHECK-32-NEXT:    i32.and $push4=, $pop2, $pop3
+; CHECK-32-NEXT:    return $pop4
+;
+; CHECK-64-LABEL: frame_offset_with_global_address:
+; CHECK-64:         .functype frame_offset_with_global_address () -> (i32)
+; CHECK-64-NEXT:  # %bb.0:
+; CHECK-64-NEXT:    i64.const $push1=, str
+; CHECK-64-NEXT:    global.get $push6=, __stack_pointer
+; CHECK-64-NEXT:    i64.const $push7=, 16
+; CHECK-64-NEXT:    i64.sub $push10=, $pop6, $pop7
+; CHECK-64-NEXT:    i64.const $push8=, 12
+; CHECK-64-NEXT:    i64.add $push9=, $pop10, $pop8
+; CHECK-64-NEXT:    i64.extend32_s $push0=, $pop9
+; CHECK-64-NEXT:    i64.add $push2=, $pop1, $pop0
+; CHECK-64-NEXT:    i32.load8_u $push3=, 0($pop2)
+; CHECK-64-NEXT:    i32.const $push4=, 67
+; CHECK-64-NEXT:    i32.and $push5=, $pop3, $pop4
+; CHECK-64-NEXT:    return $pop5
   %1 = alloca i8, align 4
   %2 = ptrtoint ptr %1 to i32
   ;; Here @str is a global address and not an immediate, so cannot be folded


        


More information about the llvm-commits mailing list