[llvm] 51cd326 - [WebAssembly] Autogenerate checks in simd-offset.ll

Thomas Lively via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 22 10:12:35 PDT 2020


Author: Thomas Lively
Date: 2020-07-22T10:12:26-07:00
New Revision: 51cd326f99bb328cb2c9ca1f6ccc28c11e73a3c1

URL: https://github.com/llvm/llvm-project/commit/51cd326f99bb328cb2c9ca1f6ccc28c11e73a3c1
DIFF: https://github.com/llvm/llvm-project/commit/51cd326f99bb328cb2c9ca1f6ccc28c11e73a3c1.diff

LOG: [WebAssembly] Autogenerate checks in simd-offset.ll

Implementing new functionality tested in this file requires adding new
tests for many IR addressing patterns, which can be a large
maintenance burden. This patch makes adding tests easier by switching
to using autogenerated checks. This patch also removes the testing
mode that has simd128 disabled, because autogenerated checks for that
mode would be very large and the mode is not particularly interesting.

Differential Revision: https://reviews.llvm.org/D84288

Added: 
    

Modified: 
    llvm/test/CodeGen/WebAssembly/simd-offset.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/WebAssembly/simd-offset.ll b/llvm/test/CodeGen/WebAssembly/simd-offset.ll
index 7ece5b782ab5..933897286eb5 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-offset.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-offset.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -mattr=+simd128 | FileCheck %s --check-prefixes CHECK,SIMD128
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals | FileCheck %s --check-prefixes CHECK,NO-SIMD128
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
 
 ; Test SIMD loads and stores
 
@@ -9,34 +9,37 @@ target triple = "wasm32-unknown-unknown"
 ; ==============================================================================
 ; 16 x i8
 ; ==============================================================================
-; CHECK-LABEL: load_v16i8:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v16i8 (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8(<16 x i8>* %p) {
+; CHECK-LABEL: load_v16i8:
+; CHECK:         .functype load_v16i8 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %v = load <16 x i8>, <16 x i8>* %p
   ret <16 x i8> %v
 }
 
-; CHECK-LABEL: load_splat_v16i8:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v16i8 (i32) -> (v128){{$}}
-; SIMD128-NEXT: v8x16.load_splat $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_splat_v16i8(i8* %p) {
+; CHECK-LABEL: load_splat_v16i8:
+; CHECK:         .functype load_splat_v16i8 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v8x16.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %e = load i8, i8* %p
   %v1 = insertelement <16 x i8> undef, i8 %e, i32 0
   %v2 = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> zeroinitializer
   ret <16 x i8> %v2
 }
 
-; CHECK-LABEL: load_v16i8_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v16i8_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_folded_offset(<16 x i8>* %p) {
+; CHECK-LABEL: load_v16i8_with_folded_offset:
+; CHECK:         .functype load_v16i8_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <16 x i8>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <16 x i8>*
@@ -44,12 +47,13 @@ define <16 x i8> @load_v16i8_with_folded_offset(<16 x i8>* %p) {
   ret <16 x i8> %v
 }
 
-; CHECK-LABEL: load_splat_v16i8_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v16i8_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v8x16.load_splat $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_splat_v16i8_with_folded_offset(i8* %p) {
+; CHECK-LABEL: load_splat_v16i8_with_folded_offset:
+; CHECK:         .functype load_splat_v16i8_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v8x16.load_splat 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint i8* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to i8*
@@ -59,23 +63,25 @@ define <16 x i8> @load_splat_v16i8_with_folded_offset(i8* %p) {
   ret <16 x i8> %v2
 }
 
-; CHECK-LABEL: load_v16i8_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v16i8_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_folded_gep_offset(<16 x i8>* %p) {
+; CHECK-LABEL: load_v16i8_with_folded_gep_offset:
+; CHECK:         .functype load_v16i8_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 16
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i32 1
   %v = load <16 x i8>, <16 x i8>* %s
   ret <16 x i8> %v
 }
 
-; CHECK-LABEL: load_splat_v16i8_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v16i8_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v8x16.load_splat $push[[R:[0-9]+]]=, 1($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_splat_v16i8_with_folded_gep_offset(i8* %p) {
+; CHECK-LABEL: load_splat_v16i8_with_folded_gep_offset:
+; CHECK:         .functype load_splat_v16i8_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v8x16.load_splat 1
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds i8, i8* %p, i32 1
   %e = load i8, i8* %s
   %v1 = insertelement <16 x i8> undef, i8 %e, i32 0
@@ -83,27 +89,29 @@ define <16 x i8> @load_splat_v16i8_with_folded_gep_offset(i8* %p) {
   ret <16 x i8> %v2
 }
 
-; CHECK-LABEL: load_v16i8_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v16i8_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_unfolded_gep_negative_offset(<16 x i8>* %p) {
+; CHECK-LABEL: load_v16i8_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_v16i8_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i32 -1
   %v = load <16 x i8>, <16 x i8>* %s
   ret <16 x i8> %v
 }
 
-; CHECK-LABEL: load_splat_v16i8_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v16i8_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v8x16.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_splat_v16i8_with_unfolded_gep_negative_offset(i8* %p) {
+; CHECK-LABEL: load_splat_v16i8_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_splat_v16i8_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -1
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v8x16.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds i8, i8* %p, i32 -1
   %e = load i8, i8* %s
   %v1 = insertelement <16 x i8> undef, i8 %e, i32 0
@@ -111,14 +119,15 @@ define <16 x i8> @load_splat_v16i8_with_unfolded_gep_negative_offset(i8* %p) {
   ret <16 x i8> %v2
 }
 
-; CHECK-LABEL: load_v16i8_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v16i8_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_unfolded_offset(<16 x i8>* %p) {
+; CHECK-LABEL: load_v16i8_with_unfolded_offset:
+; CHECK:         .functype load_v16i8_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <16 x i8>* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to <16 x i8>*
@@ -126,14 +135,15 @@ define <16 x i8> @load_v16i8_with_unfolded_offset(<16 x i8>* %p) {
   ret <16 x i8> %v
 }
 
-; CHECK-LABEL: load_splat_v16i8_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v16i8_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v8x16.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_splat_v16i8_with_unfolded_offset(i8* %p) {
+; CHECK-LABEL: load_splat_v16i8_with_unfolded_offset:
+; CHECK:         .functype load_splat_v16i8_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v8x16.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint i8* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to i8*
@@ -143,27 +153,29 @@ define <16 x i8> @load_splat_v16i8_with_unfolded_offset(i8* %p) {
   ret <16 x i8> %v2
 }
 
-; CHECK-LABEL: load_v16i8_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v16i8_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_unfolded_gep_offset(<16 x i8>* %p) {
+; CHECK-LABEL: load_v16i8_with_unfolded_gep_offset:
+; CHECK:         .functype load_v16i8_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <16 x i8>, <16 x i8>* %p, i32 1
   %v = load <16 x i8>, <16 x i8>* %s
   ret <16 x i8> %v
 }
 
-; CHECK-LABEL: load_splat_v16i8_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v16i8_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 1{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v8x16.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_splat_v16i8_with_unfolded_gep_offset(i8* %p) {
+; CHECK-LABEL: load_splat_v16i8_with_unfolded_gep_offset:
+; CHECK:         .functype load_splat_v16i8_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v8x16.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr i8, i8* %p, i32 1
   %e = load i8, i8* %s
   %v1 = insertelement <16 x i8> undef, i8 %e, i32 0
@@ -171,25 +183,25 @@ define <16 x i8> @load_splat_v16i8_with_unfolded_gep_offset(i8* %p) {
   ret <16 x i8> %v2
 }
 
-; CHECK-LABEL: load_v16i8_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v16i8_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_from_numeric_address() {
+; CHECK-LABEL: load_v16i8_from_numeric_address:
+; CHECK:         .functype load_v16i8_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <16 x i8>*
   %v = load <16 x i8>, <16 x i8>* %s
   ret <16 x i8> %v
 }
 
-; CHECK-LABEL: load_splat_v16i8_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v16i8_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v8x16.load_splat $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_splat_v16i8_from_numeric_address() {
+; CHECK-LABEL: load_splat_v16i8_from_numeric_address:
+; CHECK:         .functype load_splat_v16i8_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v8x16.load_splat 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to i8*
   %e = load i8, i8* %s
   %v1 = insertelement <16 x i8> undef, i8 %e, i32 0
@@ -197,46 +209,52 @@ define <16 x i8> @load_splat_v16i8_from_numeric_address() {
   ret <16 x i8> %v2
 }
 
-; CHECK-LABEL: load_v16i8_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v16i8_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v16i8($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v16i8 = global <16 x i8> <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
 define <16 x i8> @load_v16i8_from_global_address() {
+; CHECK-LABEL: load_v16i8_from_global_address:
+; CHECK:         .functype load_v16i8_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load gv_v16i8
+; CHECK-NEXT:    # fallthrough-return
   %v = load <16 x i8>, <16 x i8>* @gv_v16i8
   ret <16 x i8> %v
 }
 
-; CHECK-LABEL: load_splat_v16i8_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v16i8_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v8x16.load_splat $push[[R:[0-9]+]]=, gv_i8($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_i8 = global i8 42
 define <16 x i8> @load_splat_v16i8_from_global_address() {
+; CHECK-LABEL: load_splat_v16i8_from_global_address:
+; CHECK:         .functype load_splat_v16i8_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v8x16.load_splat gv_i8
+; CHECK-NEXT:    # fallthrough-return
   %e = load i8, i8* @gv_i8
   %v1 = insertelement <16 x i8> undef, i8 %e, i32 0
   %v2 = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> zeroinitializer
   ret <16 x i8> %v2
 }
 
-; CHECK-LABEL: store_v16i8:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v16i8 (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v16i8(<16 x i8> %v, <16 x i8>* %p) {
+; CHECK-LABEL: store_v16i8:
+; CHECK:         .functype store_v16i8 (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   store <16 x i8> %v , <16 x i8>* %p
   ret void
 }
 
-; CHECK-LABEL: store_v16i8_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v16i8_with_folded_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v16i8_with_folded_offset(<16 x i8> %v, <16 x i8>* %p) {
+; CHECK-LABEL: store_v16i8_with_folded_offset:
+; CHECK:         .functype store_v16i8_with_folded_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <16 x i8>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <16 x i8>*
@@ -244,69 +262,85 @@ define void @store_v16i8_with_folded_offset(<16 x i8> %v, <16 x i8>* %p) {
   ret void
 }
 
-; CHECK-LABEL: store_v16i8_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v16i8_with_folded_gep_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v16i8_with_folded_gep_offset(<16 x i8> %v, <16 x i8>* %p) {
+; CHECK-LABEL: store_v16i8_with_folded_gep_offset:
+; CHECK:         .functype store_v16i8_with_folded_gep_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i32 1
   store <16 x i8> %v , <16 x i8>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v16i8_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v16i8_with_unfolded_gep_negative_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v16i8_with_unfolded_gep_negative_offset(<16 x i8> %v, <16 x i8>* %p) {
+; CHECK-LABEL: store_v16i8_with_unfolded_gep_negative_offset:
+; CHECK:         .functype store_v16i8_with_unfolded_gep_negative_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i32 -1
   store <16 x i8> %v , <16 x i8>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v16i8_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v16i8_with_unfolded_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v16i8_with_unfolded_offset(<16 x i8> %v, <16 x i8>* %p) {
+; CHECK-LABEL: store_v16i8_with_unfolded_offset:
+; CHECK:         .functype store_v16i8_with_unfolded_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i32 -1
   store <16 x i8> %v , <16 x i8>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v16i8_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v16i8_with_unfolded_gep_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v16i8_with_unfolded_gep_offset(<16 x i8> %v, <16 x i8>* %p) {
+; CHECK-LABEL: store_v16i8_with_unfolded_gep_offset:
+; CHECK:         .functype store_v16i8_with_unfolded_gep_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <16 x i8>, <16 x i8>* %p, i32 1
   store <16 x i8> %v , <16 x i8>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v16i8_to_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v16i8_to_numeric_address (v128) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[R]]), $0{{$}}
 define void @store_v16i8_to_numeric_address(<16 x i8> %v) {
+; CHECK-LABEL: store_v16i8_to_numeric_address:
+; CHECK:         .functype store_v16i8_to_numeric_address (v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <16 x i8>*
   store <16 x i8> %v , <16 x i8>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v16i8_to_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v16i8_to_global_address (v128) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v16i8($pop[[R]]), $0{{$}}
 define void @store_v16i8_to_global_address(<16 x i8> %v) {
+; CHECK-LABEL: store_v16i8_to_global_address:
+; CHECK:         .functype store_v16i8_to_global_address (v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store gv_v16i8
+; CHECK-NEXT:    # fallthrough-return
   store <16 x i8> %v , <16 x i8>* @gv_v16i8
   ret void
 }
@@ -314,66 +348,72 @@ define void @store_v16i8_to_global_address(<16 x i8> %v) {
 ; ==============================================================================
 ; 8 x i16
 ; ==============================================================================
-; CHECK-LABEL: load_v8i16:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v8i16 (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16(<8 x i16>* %p) {
+; CHECK-LABEL: load_v8i16:
+; CHECK:         .functype load_v8i16 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %v = load <8 x i16>, <8 x i16>* %p
   ret <8 x i16> %v
 }
 
-; CHECK-LABEL: load_splat_v8i16:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v8i16 (i32) -> (v128){{$}}
-; SIMD128-NEXT: v16x8.load_splat $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_splat_v8i16(i16* %p) {
+; CHECK-LABEL: load_splat_v8i16:
+; CHECK:         .functype load_splat_v8i16 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v16x8.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %e = load i16, i16* %p
   %v1 = insertelement <8 x i16> undef, i16 %e, i32 0
   %v2 = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> zeroinitializer
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_sext_v8i16:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v8i16 (i32) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.load8x8_s $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_sext_v8i16(<8 x i8>* %p) {
+; CHECK-LABEL: load_sext_v8i16:
+; CHECK:         .functype load_sext_v8i16 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.load8x8_s 0
+; CHECK-NEXT:    # fallthrough-return
   %v = load <8 x i8>, <8 x i8>* %p
   %v2 = sext <8 x i8> %v to <8 x i16>
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_zext_v8i16:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v8i16 (i32) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.load8x8_u $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_zext_v8i16(<8 x i8>* %p) {
+; CHECK-LABEL: load_zext_v8i16:
+; CHECK:         .functype load_zext_v8i16 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.load8x8_u 0
+; CHECK-NEXT:    # fallthrough-return
   %v = load <8 x i8>, <8 x i8>* %p
   %v2 = zext <8 x i8> %v to <8 x i16>
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_ext_v8i16:
-; NO-SIMD128-NOT: load8x8
-; SIMD128-NEXT: .functype load_ext_v8i16 (i32) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.load8x8_u $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i8> @load_ext_v8i16(<8 x i8>* %p) {
+; CHECK-LABEL: load_ext_v8i16:
+; CHECK:         .functype load_ext_v8i16 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.load8x8_u 0
+; CHECK-NEXT:    # fallthrough-return
   %v = load <8 x i8>, <8 x i8>* %p
   ret <8 x i8> %v
 }
 
-; CHECK-LABEL: load_v8i16_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v8i16_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_with_folded_offset(<8 x i16>* %p) {
+; CHECK-LABEL: load_v8i16_with_folded_offset:
+; CHECK:         .functype load_v8i16_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <8 x i16>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <8 x i16>*
@@ -381,12 +421,13 @@ define <8 x i16> @load_v8i16_with_folded_offset(<8 x i16>* %p) {
   ret <8 x i16> %v
 }
 
-; CHECK-LABEL: load_splat_v8i16_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v8i16_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v16x8.load_splat $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_splat_v8i16_with_folded_offset(i16* %p) {
+; CHECK-LABEL: load_splat_v8i16_with_folded_offset:
+; CHECK:         .functype load_splat_v8i16_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v16x8.load_splat 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint i16* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to i16*
@@ -396,12 +437,13 @@ define <8 x i16> @load_splat_v8i16_with_folded_offset(i16* %p) {
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_sext_v8i16_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v8i16_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.load8x8_s $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_sext_v8i16_with_folded_offset(<8 x i8>* %p) {
+; CHECK-LABEL: load_sext_v8i16_with_folded_offset:
+; CHECK:         .functype load_sext_v8i16_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.load8x8_s 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <8 x i8>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <8 x i8>*
@@ -410,12 +452,13 @@ define <8 x i16> @load_sext_v8i16_with_folded_offset(<8 x i8>* %p) {
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_zext_v8i16_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v8i16_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.load8x8_u $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_zext_v8i16_with_folded_offset(<8 x i8>* %p) {
+; CHECK-LABEL: load_zext_v8i16_with_folded_offset:
+; CHECK:         .functype load_zext_v8i16_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.load8x8_u 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <8 x i8>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <8 x i8>*
@@ -424,12 +467,13 @@ define <8 x i16> @load_zext_v8i16_with_folded_offset(<8 x i8>* %p) {
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_ext_v8i16_with_folded_offset:
-; NO-SIMD128-NOT: load8x8
-; SIMD128-NEXT: .functype load_ext_v8i16_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.load8x8_u $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i8> @load_ext_v8i16_with_folded_offset(<8 x i8>* %p) {
+; CHECK-LABEL: load_ext_v8i16_with_folded_offset:
+; CHECK:         .functype load_ext_v8i16_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.load8x8_u 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <8 x i8>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <8 x i8>*
@@ -437,23 +481,25 @@ define <8 x i8> @load_ext_v8i16_with_folded_offset(<8 x i8>* %p) {
   ret <8 x i8> %v
 }
 
-; CHECK-LABEL: load_v8i16_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v8i16_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_with_folded_gep_offset(<8 x i16>* %p) {
+; CHECK-LABEL: load_v8i16_with_folded_gep_offset:
+; CHECK:         .functype load_v8i16_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 16
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i32 1
   %v = load <8 x i16>, <8 x i16>* %s
   ret <8 x i16> %v
 }
 
-; CHECK-LABEL: load_splat_v8i16_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v8i16_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v16x8.load_splat $push[[R:[0-9]+]]=, 2($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_splat_v8i16_with_folded_gep_offset(i16* %p) {
+; CHECK-LABEL: load_splat_v8i16_with_folded_gep_offset:
+; CHECK:         .functype load_splat_v8i16_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v16x8.load_splat 2
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds i16, i16* %p, i32 1
   %e = load i16, i16* %s
   %v1 = insertelement <8 x i16> undef, i16 %e, i32 0
@@ -461,62 +507,67 @@ define <8 x i16> @load_splat_v8i16_with_folded_gep_offset(i16* %p) {
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_sext_v8i16_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v8i16_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.load8x8_s $push[[R:[0-9]+]]=, 8($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_sext_v8i16_with_folded_gep_offset(<8 x i8>* %p) {
+; CHECK-LABEL: load_sext_v8i16_with_folded_gep_offset:
+; CHECK:         .functype load_sext_v8i16_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.load8x8_s 8
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <8 x i8>, <8 x i8>* %p, i32 1
   %v = load <8 x i8>, <8 x i8>* %s
   %v2 = sext <8 x i8> %v to <8 x i16>
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_zext_v8i16_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v8i16_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.load8x8_u $push[[R:[0-9]+]]=, 8($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_zext_v8i16_with_folded_gep_offset(<8 x i8>* %p) {
+; CHECK-LABEL: load_zext_v8i16_with_folded_gep_offset:
+; CHECK:         .functype load_zext_v8i16_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.load8x8_u 8
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <8 x i8>, <8 x i8>* %p, i32 1
   %v = load <8 x i8>, <8 x i8>* %s
   %v2 = zext <8 x i8> %v to <8 x i16>
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_ext_v8i16_with_folded_gep_offset:
-; NO-SIMD128-NOT: load8x8
-; SIMD128-NEXT: .functype load_ext_v8i16_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.load8x8_u $push[[R:[0-9]+]]=, 8($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i8> @load_ext_v8i16_with_folded_gep_offset(<8 x i8>* %p) {
+; CHECK-LABEL: load_ext_v8i16_with_folded_gep_offset:
+; CHECK:         .functype load_ext_v8i16_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.load8x8_u 8
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <8 x i8>, <8 x i8>* %p, i32 1
   %v = load <8 x i8>, <8 x i8>* %s
   ret <8 x i8> %v
 }
 
-; CHECK-LABEL: load_v8i16_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v8i16_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_with_unfolded_gep_negative_offset(<8 x i16>* %p) {
+; CHECK-LABEL: load_v8i16_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_v8i16_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i32 -1
   %v = load <8 x i16>, <8 x i16>* %s
   ret <8 x i16> %v
 }
 
-; CHECK-LABEL: load_splat_v8i16_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v8i16_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -2{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v16x8.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_splat_v8i16_with_unfolded_gep_negative_offset(i16* %p) {
+; CHECK-LABEL: load_splat_v8i16_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_splat_v8i16_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -2
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v16x8.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds i16, i16* %p, i32 -1
   %e = load i16, i16* %s
   %v1 = insertelement <8 x i16> undef, i16 %e, i32 0
@@ -524,55 +575,59 @@ define <8 x i16> @load_splat_v8i16_with_unfolded_gep_negative_offset(i16* %p) {
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_sext_v8i16_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v8i16_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i16x8.load8x8_s $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_sext_v8i16_with_unfolded_gep_negative_offset(<8 x i8>* %p) {
+; CHECK-LABEL: load_sext_v8i16_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_sext_v8i16_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i16x8.load8x8_s 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <8 x i8>, <8 x i8>* %p, i32 -1
   %v = load <8 x i8>, <8 x i8>* %s
   %v2 = sext <8 x i8> %v to <8 x i16>
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_zext_v8i16_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v8i16_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i16x8.load8x8_u $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_zext_v8i16_with_unfolded_gep_negative_offset(<8 x i8>* %p) {
+; CHECK-LABEL: load_zext_v8i16_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_zext_v8i16_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i16x8.load8x8_u 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <8 x i8>, <8 x i8>* %p, i32 -1
   %v = load <8 x i8>, <8 x i8>* %s
   %v2 = zext <8 x i8> %v to <8 x i16>
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_ext_v8i16_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: load8x8
-; SIMD128-NEXT: .functype load_ext_v8i16_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i16x8.load8x8_u $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i8> @load_ext_v8i16_with_unfolded_gep_negative_offset(<8 x i8>* %p) {
+; CHECK-LABEL: load_ext_v8i16_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_ext_v8i16_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i16x8.load8x8_u 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <8 x i8>, <8 x i8>* %p, i32 -1
   %v = load <8 x i8>, <8 x i8>* %s
   ret <8 x i8> %v
 }
 
-; CHECK-LABEL: load_v8i16_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v8i16_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[L0:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[L0]]{{$}}
 define <8 x i16> @load_v8i16_with_unfolded_offset(<8 x i16>* %p) {
+; CHECK-LABEL: load_v8i16_with_unfolded_offset:
+; CHECK:         .functype load_v8i16_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <8 x i16>* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to <8 x i16>*
@@ -580,14 +635,15 @@ define <8 x i16> @load_v8i16_with_unfolded_offset(<8 x i16>* %p) {
   ret <8 x i16> %v
 }
 
-; CHECK-LABEL: load_splat_v8i16_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v8i16_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v16x8.load_splat $push[[L0:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[L0]]{{$}}
 define <8 x i16> @load_splat_v8i16_with_unfolded_offset(i16* %p) {
+; CHECK-LABEL: load_splat_v8i16_with_unfolded_offset:
+; CHECK:         .functype load_splat_v8i16_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v16x8.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint i16* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to i16*
@@ -597,14 +653,15 @@ define <8 x i16> @load_splat_v8i16_with_unfolded_offset(i16* %p) {
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_sext_v8i16_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v8i16_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i16x8.load8x8_s $push[[L0:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[L0]]{{$}}
 define <8 x i16> @load_sext_v8i16_with_unfolded_offset(<8 x i8>* %p) {
+; CHECK-LABEL: load_sext_v8i16_with_unfolded_offset:
+; CHECK:         .functype load_sext_v8i16_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i16x8.load8x8_s 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <8 x i8>* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to <8 x i8>*
@@ -613,14 +670,15 @@ define <8 x i16> @load_sext_v8i16_with_unfolded_offset(<8 x i8>* %p) {
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_zext_v8i16_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v8i16_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i16x8.load8x8_u $push[[L0:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[L0]]{{$}}
 define <8 x i16> @load_zext_v8i16_with_unfolded_offset(<8 x i8>* %p) {
+; CHECK-LABEL: load_zext_v8i16_with_unfolded_offset:
+; CHECK:         .functype load_zext_v8i16_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i16x8.load8x8_u 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <8 x i8>* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to <8 x i8>*
@@ -629,14 +687,15 @@ define <8 x i16> @load_zext_v8i16_with_unfolded_offset(<8 x i8>* %p) {
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_ext_v8i16_with_unfolded_offset:
-; NO-SIMD128-NOT: load8x8
-; SIMD128-NEXT: .functype load_ext_v8i16_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i16x8.load8x8_u $push[[L0:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[L0]]{{$}}
 define <8 x i8> @load_ext_v8i16_with_unfolded_offset(<8 x i8>* %p) {
+; CHECK-LABEL: load_ext_v8i16_with_unfolded_offset:
+; CHECK:         .functype load_ext_v8i16_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i16x8.load8x8_u 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <8 x i8>* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to <8 x i8>*
@@ -644,27 +703,29 @@ define <8 x i8> @load_ext_v8i16_with_unfolded_offset(<8 x i8>* %p) {
   ret <8 x i8> %v
 }
 
-; CHECK-LABEL: load_v8i16_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v8i16_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_with_unfolded_gep_offset(<8 x i16>* %p) {
+; CHECK-LABEL: load_v8i16_with_unfolded_gep_offset:
+; CHECK:         .functype load_v8i16_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <8 x i16>, <8 x i16>* %p, i32 1
   %v = load <8 x i16>, <8 x i16>* %s
   ret <8 x i16> %v
 }
 
-; CHECK-LABEL: load_splat_v8i16_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v8i16_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 2{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v16x8.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_splat_v8i16_with_unfolded_gep_offset(i16* %p) {
+; CHECK-LABEL: load_splat_v8i16_with_unfolded_gep_offset:
+; CHECK:         .functype load_splat_v8i16_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 2
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v16x8.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr i16, i16* %p, i32 1
   %e = load i16, i16* %s
   %v1 = insertelement <8 x i16> undef, i16 %e, i32 0
@@ -672,66 +733,69 @@ define <8 x i16> @load_splat_v8i16_with_unfolded_gep_offset(i16* %p) {
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_sext_v8i16_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v8i16_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i16x8.load8x8_s $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_sext_v8i16_with_unfolded_gep_offset(<8 x i8>* %p) {
+; CHECK-LABEL: load_sext_v8i16_with_unfolded_gep_offset:
+; CHECK:         .functype load_sext_v8i16_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i16x8.load8x8_s 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <8 x i8>, <8 x i8>* %p, i32 1
   %v = load <8 x i8>, <8 x i8>* %s
   %v2 = sext <8 x i8> %v to <8 x i16>
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_zext_v8i16_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v8i16_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i16x8.load8x8_u $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_zext_v8i16_with_unfolded_gep_offset(<8 x i8>* %p) {
+; CHECK-LABEL: load_zext_v8i16_with_unfolded_gep_offset:
+; CHECK:         .functype load_zext_v8i16_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i16x8.load8x8_u 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <8 x i8>, <8 x i8>* %p, i32 1
   %v = load <8 x i8>, <8 x i8>* %s
   %v2 = zext <8 x i8> %v to <8 x i16>
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_ext_v8i16_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: load8x8
-; SIMD128-NEXT: .functype load_ext_v8i16_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i16x8.load8x8_u $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i8> @load_ext_v8i16_with_unfolded_gep_offset(<8 x i8>* %p) {
+; CHECK-LABEL: load_ext_v8i16_with_unfolded_gep_offset:
+; CHECK:         .functype load_ext_v8i16_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i16x8.load8x8_u 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <8 x i8>, <8 x i8>* %p, i32 1
   %v = load <8 x i8>, <8 x i8>* %s
   ret <8 x i8> %v
 }
 
-; CHECK-LABEL: load_v8i16_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v8i16_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_from_numeric_address() {
+; CHECK-LABEL: load_v8i16_from_numeric_address:
+; CHECK:         .functype load_v8i16_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <8 x i16>*
   %v = load <8 x i16>, <8 x i16>* %s
   ret <8 x i16> %v
 }
 
-; CHECK-LABEL: load_splat_v8i16_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v8i16_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v16x8.load_splat $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_splat_v8i16_from_numeric_address() {
+; CHECK-LABEL: load_splat_v8i16_from_numeric_address:
+; CHECK:         .functype load_splat_v8i16_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v16x8.load_splat 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to i16*
   %e = load i16, i16* %s
   %v1 = insertelement <8 x i16> undef, i16 %e, i32 0
@@ -739,121 +803,127 @@ define <8 x i16> @load_splat_v8i16_from_numeric_address() {
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_sext_v8i16_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v8i16_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i16x8.load8x8_s $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_sext_v8i16_from_numeric_address() {
+; CHECK-LABEL: load_sext_v8i16_from_numeric_address:
+; CHECK:         .functype load_sext_v8i16_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i16x8.load8x8_s 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <8 x i8>*
   %v = load <8 x i8>, <8 x i8>* %s
   %v2 = sext <8 x i8> %v to <8 x i16>
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_zext_v8i16_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v8i16_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i16x8.load8x8_u $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_zext_v8i16_from_numeric_address() {
+; CHECK-LABEL: load_zext_v8i16_from_numeric_address:
+; CHECK:         .functype load_zext_v8i16_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i16x8.load8x8_u 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <8 x i8>*
   %v = load <8 x i8>, <8 x i8>* %s
   %v2 = zext <8 x i8> %v to <8 x i16>
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_ext_v8i16_from_numeric_address:
-; NO-SIMD128-NOT: load8x8
-; SIMD128-NEXT: .functype load_ext_v8i16_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i16x8.load8x8_u $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i8> @load_ext_v8i16_from_numeric_address() {
+; CHECK-LABEL: load_ext_v8i16_from_numeric_address:
+; CHECK:         .functype load_ext_v8i16_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i16x8.load8x8_u 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <8 x i8>*
   %v = load <8 x i8>, <8 x i8>* %s
   ret <8 x i8> %v
 }
 
-; CHECK-LABEL: load_v8i16_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v8i16_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v8i16($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v8i16 = global <8 x i16> <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
 define <8 x i16> @load_v8i16_from_global_address() {
+; CHECK-LABEL: load_v8i16_from_global_address:
+; CHECK:         .functype load_v8i16_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load gv_v8i16
+; CHECK-NEXT:    # fallthrough-return
   %v = load <8 x i16>, <8 x i16>* @gv_v8i16
   ret <8 x i16> %v
 }
 
-; CHECK-LABEL: load_splat_v8i16_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v8i16_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v16x8.load_splat $push[[R:[0-9]+]]=, gv_i16($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_i16 = global i16 42
 define <8 x i16> @load_splat_v8i16_from_global_address() {
+; CHECK-LABEL: load_splat_v8i16_from_global_address:
+; CHECK:         .functype load_splat_v8i16_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v16x8.load_splat gv_i16
+; CHECK-NEXT:    # fallthrough-return
   %e = load i16, i16* @gv_i16
   %v1 = insertelement <8 x i16> undef, i16 %e, i32 0
   %v2 = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> zeroinitializer
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_sext_v8i16_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v8i16_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i16x8.load8x8_s $push[[R:[0-9]+]]=, gv_v8i8($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v8i8 = global <8 x i8> <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
 define <8 x i16> @load_sext_v8i16_from_global_address() {
+; CHECK-LABEL: load_sext_v8i16_from_global_address:
+; CHECK:         .functype load_sext_v8i16_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i16x8.load8x8_s gv_v8i8
+; CHECK-NEXT:    # fallthrough-return
   %v = load <8 x i8>, <8 x i8>* @gv_v8i8
   %v2 = sext <8 x i8> %v to <8 x i16>
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_zext_v8i16_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v8i16_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i16x8.load8x8_u $push[[R:[0-9]+]]=, gv_v8i8($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_zext_v8i16_from_global_address() {
+; CHECK-LABEL: load_zext_v8i16_from_global_address:
+; CHECK:         .functype load_zext_v8i16_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i16x8.load8x8_u gv_v8i8
+; CHECK-NEXT:    # fallthrough-return
   %v = load <8 x i8>, <8 x i8>* @gv_v8i8
   %v2 = zext <8 x i8> %v to <8 x i16>
   ret <8 x i16> %v2
 }
 
-; CHECK-LABEL: load_ext_v8i16_from_global_address:
-; NO-SIMD128-NOT: load8x8
-; SIMD128-NEXT: .functype load_ext_v8i16_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i16x8.load8x8_u $push[[R:[0-9]+]]=, gv_v8i8($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i8> @load_ext_v8i16_from_global_address() {
+; CHECK-LABEL: load_ext_v8i16_from_global_address:
+; CHECK:         .functype load_ext_v8i16_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i16x8.load8x8_u gv_v8i8
+; CHECK-NEXT:    # fallthrough-return
   %v = load <8 x i8>, <8 x i8>* @gv_v8i8
   ret <8 x i8> %v
 }
 
 
-; CHECK-LABEL: store_v8i16:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v8i16 (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v8i16(<8 x i16> %v, <8 x i16>* %p) {
+; CHECK-LABEL: store_v8i16:
+; CHECK:         .functype store_v8i16 (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   store <8 x i16> %v , <8 x i16>* %p
   ret void
 }
 
-; CHECK-LABEL: store_v8i16_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v8i16_with_folded_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v8i16_with_folded_offset(<8 x i16> %v, <8 x i16>* %p) {
+; CHECK-LABEL: store_v8i16_with_folded_offset:
+; CHECK:         .functype store_v8i16_with_folded_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <8 x i16>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <8 x i16>*
@@ -861,69 +931,85 @@ define void @store_v8i16_with_folded_offset(<8 x i16> %v, <8 x i16>* %p) {
   ret void
 }
 
-; CHECK-LABEL: store_v8i16_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v8i16_with_folded_gep_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v8i16_with_folded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
+; CHECK-LABEL: store_v8i16_with_folded_gep_offset:
+; CHECK:         .functype store_v8i16_with_folded_gep_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i32 1
   store <8 x i16> %v , <8 x i16>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v8i16_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v8i16_with_unfolded_gep_negative_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v8i16_with_unfolded_gep_negative_offset(<8 x i16> %v, <8 x i16>* %p) {
+; CHECK-LABEL: store_v8i16_with_unfolded_gep_negative_offset:
+; CHECK:         .functype store_v8i16_with_unfolded_gep_negative_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i32 -1
   store <8 x i16> %v , <8 x i16>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v8i16_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v8i16_with_unfolded_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v8i16_with_unfolded_offset(<8 x i16> %v, <8 x i16>* %p) {
+; CHECK-LABEL: store_v8i16_with_unfolded_offset:
+; CHECK:         .functype store_v8i16_with_unfolded_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i32 -1
   store <8 x i16> %v , <8 x i16>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v8i16_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v8i16_with_unfolded_gep_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v8i16_with_unfolded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
+; CHECK-LABEL: store_v8i16_with_unfolded_gep_offset:
+; CHECK:         .functype store_v8i16_with_unfolded_gep_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <8 x i16>, <8 x i16>* %p, i32 1
   store <8 x i16> %v , <8 x i16>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v8i16_to_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v8i16_to_numeric_address (v128) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v8i16_to_numeric_address(<8 x i16> %v) {
+; CHECK-LABEL: store_v8i16_to_numeric_address:
+; CHECK:         .functype store_v8i16_to_numeric_address (v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <8 x i16>*
   store <8 x i16> %v , <8 x i16>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v8i16_to_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v8i16_to_global_address (v128) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v8i16($pop[[R]]), $0{{$}}
 define void @store_v8i16_to_global_address(<8 x i16> %v) {
+; CHECK-LABEL: store_v8i16_to_global_address:
+; CHECK:         .functype store_v8i16_to_global_address (v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store gv_v8i16
+; CHECK-NEXT:    # fallthrough-return
   store <8 x i16> %v , <8 x i16>* @gv_v8i16
   ret void
 }
@@ -931,65 +1017,72 @@ define void @store_v8i16_to_global_address(<8 x i16> %v) {
 ; ==============================================================================
 ; 4 x i32
 ; ==============================================================================
-; CHECK-LABEL: load_v4i32:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v4i32 (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32(<4 x i32>* %p) {
+; CHECK-LABEL: load_v4i32:
+; CHECK:         .functype load_v4i32 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %v = load <4 x i32>, <4 x i32>* %p
   ret <4 x i32> %v
 }
 
-; CHECK-LABEL: load_splat_v4i32:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v4i32 (i32) -> (v128){{$}}
-; SIMD128-NEXT: v32x4.load_splat
 define <4 x i32> @load_splat_v4i32(i32* %addr) {
+; CHECK-LABEL: load_splat_v4i32:
+; CHECK:         .functype load_splat_v4i32 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v32x4.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %e = load i32, i32* %addr, align 4
   %v1 = insertelement <4 x i32> undef, i32 %e, i32 0
   %v2 = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> zeroinitializer
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_sext_v4i32:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v4i32 (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.load16x4_s $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_sext_v4i32(<4 x i16>* %p) {
+; CHECK-LABEL: load_sext_v4i32:
+; CHECK:         .functype load_sext_v4i32 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.load16x4_s 0
+; CHECK-NEXT:    # fallthrough-return
   %v = load <4 x i16>, <4 x i16>* %p
   %v2 = sext <4 x i16> %v to <4 x i32>
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_zext_v4i32:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v4i32 (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.load16x4_u $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_zext_v4i32(<4 x i16>* %p) {
+; CHECK-LABEL: load_zext_v4i32:
+; CHECK:         .functype load_zext_v4i32 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.load16x4_u 0
+; CHECK-NEXT:    # fallthrough-return
   %v = load <4 x i16>, <4 x i16>* %p
   %v2 = zext <4 x i16> %v to <4 x i32>
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_ext_v4i32:
-; NO-SIMD128-NOT: load16x4
-; SIMD128-NEXT: .functype load_ext_v4i32 (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.load16x4_u $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i16> @load_ext_v4i32(<4 x i16>* %p) {
+; CHECK-LABEL: load_ext_v4i32:
+; CHECK:         .functype load_ext_v4i32 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.load16x4_u 0
+; CHECK-NEXT:    # fallthrough-return
   %v = load <4 x i16>, <4 x i16>* %p
   ret <4 x i16> %v
 }
 
-; CHECK-LABEL: load_v4i32_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v4i32_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_folded_offset(<4 x i32>* %p) {
+; CHECK-LABEL: load_v4i32_with_folded_offset:
+; CHECK:         .functype load_v4i32_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <4 x i32>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <4 x i32>*
@@ -997,12 +1090,13 @@ define <4 x i32> @load_v4i32_with_folded_offset(<4 x i32>* %p) {
   ret <4 x i32> %v
 }
 
-; CHECK-LABEL: load_splat_v4i32_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v4i32_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v32x4.load_splat $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_splat_v4i32_with_folded_offset(i32* %p) {
+; CHECK-LABEL: load_splat_v4i32_with_folded_offset:
+; CHECK:         .functype load_splat_v4i32_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v32x4.load_splat 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint i32* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to i32*
@@ -1012,12 +1106,13 @@ define <4 x i32> @load_splat_v4i32_with_folded_offset(i32* %p) {
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_sext_v4i32_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v4i32_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.load16x4_s $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_sext_v4i32_with_folded_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_sext_v4i32_with_folded_offset:
+; CHECK:         .functype load_sext_v4i32_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.load16x4_s 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <4 x i16>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <4 x i16>*
@@ -1026,12 +1121,13 @@ define <4 x i32> @load_sext_v4i32_with_folded_offset(<4 x i16>* %p) {
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_zext_v4i32_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v4i32_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.load16x4_u $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_zext_v4i32_with_folded_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_zext_v4i32_with_folded_offset:
+; CHECK:         .functype load_zext_v4i32_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.load16x4_u 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <4 x i16>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <4 x i16>*
@@ -1040,12 +1136,13 @@ define <4 x i32> @load_zext_v4i32_with_folded_offset(<4 x i16>* %p) {
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_ext_v4i32_with_folded_offset:
-; NO-SIMD128-NOT: load16x4
-; SIMD128-NEXT: .functype load_ext_v4i32_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.load16x4_u $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i16> @load_ext_v4i32_with_folded_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_ext_v4i32_with_folded_offset:
+; CHECK:         .functype load_ext_v4i32_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.load16x4_u 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <4 x i16>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <4 x i16>*
@@ -1053,23 +1150,25 @@ define <4 x i16> @load_ext_v4i32_with_folded_offset(<4 x i16>* %p) {
   ret <4 x i16> %v
 }
 
-; CHECK-LABEL: load_v4i32_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v4i32_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_folded_gep_offset(<4 x i32>* %p) {
+; CHECK-LABEL: load_v4i32_with_folded_gep_offset:
+; CHECK:         .functype load_v4i32_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 16
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 1
   %v = load <4 x i32>, <4 x i32>* %s
   ret <4 x i32> %v
 }
 
-; CHECK-LABEL: load_splat_v4i32_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v4i32_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v32x4.load_splat $push[[R:[0-9]+]]=, 4($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_splat_v4i32_with_folded_gep_offset(i32* %p) {
+; CHECK-LABEL: load_splat_v4i32_with_folded_gep_offset:
+; CHECK:         .functype load_splat_v4i32_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v32x4.load_splat 4
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds i32, i32* %p, i32 1
   %e = load i32, i32* %s
   %v1 = insertelement <4 x i32> undef, i32 %e, i32 0
@@ -1077,62 +1176,67 @@ define <4 x i32> @load_splat_v4i32_with_folded_gep_offset(i32* %p) {
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_sext_v4i32_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v4i32_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.load16x4_s $push[[R:[0-9]+]]=, 8($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_sext_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_sext_v4i32_with_folded_gep_offset:
+; CHECK:         .functype load_sext_v4i32_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.load16x4_s 8
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <4 x i16>, <4 x i16>* %p, i32 1
   %v = load <4 x i16>, <4 x i16>* %s
   %v2 = sext <4 x i16> %v to <4 x i32>
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_zext_v4i32_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v4i32_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.load16x4_u $push[[R:[0-9]+]]=, 8($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_zext_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_zext_v4i32_with_folded_gep_offset:
+; CHECK:         .functype load_zext_v4i32_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.load16x4_u 8
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <4 x i16>, <4 x i16>* %p, i32 1
   %v = load <4 x i16>, <4 x i16>* %s
   %v2 = zext <4 x i16> %v to <4 x i32>
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_ext_v4i32_with_folded_gep_offset:
-; NO-SIMD128-NOT: load16x4
-; SIMD128-NEXT: .functype load_ext_v4i32_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.load16x4_u $push[[R:[0-9]+]]=, 8($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i16> @load_ext_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_ext_v4i32_with_folded_gep_offset:
+; CHECK:         .functype load_ext_v4i32_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.load16x4_u 8
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <4 x i16>, <4 x i16>* %p, i32 1
   %v = load <4 x i16>, <4 x i16>* %s
   ret <4 x i16> %v
 }
 
-; CHECK-LABEL: load_v4i32_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_unfolded_gep_negative_offset(<4 x i32>* %p) {
+; CHECK-LABEL: load_v4i32_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 -1
   %v = load <4 x i32>, <4 x i32>* %s
   ret <4 x i32> %v
 }
 
-; CHECK-LABEL: load_splat_v4i32_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -4{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v32x4.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_splat_v4i32_with_unfolded_gep_negative_offset(i32* %p) {
+; CHECK-LABEL: load_splat_v4i32_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_splat_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -4
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v32x4.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds i32, i32* %p, i32 -1
   %e = load i32, i32* %s
   %v1 = insertelement <4 x i32> undef, i32 %e, i32 0
@@ -1140,55 +1244,59 @@ define <4 x i32> @load_splat_v4i32_with_unfolded_gep_negative_offset(i32* %p) {
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_sext_v4i32_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i32x4.load16x4_s $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_sext_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_sext_v4i32_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_sext_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i32x4.load16x4_s 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <4 x i16>, <4 x i16>* %p, i32 -1
   %v = load <4 x i16>, <4 x i16>* %s
   %v2 = sext <4 x i16> %v to <4 x i32>
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_zext_v4i32_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i32x4.load16x4_u $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_zext_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_zext_v4i32_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_zext_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i32x4.load16x4_u 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <4 x i16>, <4 x i16>* %p, i32 -1
   %v = load <4 x i16>, <4 x i16>* %s
   %v2 = zext <4 x i16> %v to <4 x i32>
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_ext_v4i32_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: load16x4
-; SIMD128-NEXT: .functype load_ext_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i32x4.load16x4_u $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i16> @load_ext_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_ext_v4i32_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_ext_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i32x4.load16x4_u 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <4 x i16>, <4 x i16>* %p, i32 -1
   %v = load <4 x i16>, <4 x i16>* %s
   ret <4 x i16> %v
 }
 
-; CHECK-LABEL: load_v4i32_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v4i32_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_unfolded_offset(<4 x i32>* %p) {
+; CHECK-LABEL: load_v4i32_with_unfolded_offset:
+; CHECK:         .functype load_v4i32_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <4 x i32>* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to <4 x i32>*
@@ -1196,14 +1304,15 @@ define <4 x i32> @load_v4i32_with_unfolded_offset(<4 x i32>* %p) {
   ret <4 x i32> %v
 }
 
-; CHECK-LABEL: load_splat_v4i32_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v4i32_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v32x4.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_splat_v4i32_with_unfolded_offset(i32* %p) {
+; CHECK-LABEL: load_splat_v4i32_with_unfolded_offset:
+; CHECK:         .functype load_splat_v4i32_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v32x4.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint i32* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to i32*
@@ -1213,14 +1322,15 @@ define <4 x i32> @load_splat_v4i32_with_unfolded_offset(i32* %p) {
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_sext_v4i32_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v4i32_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i32x4.load16x4_s $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_sext_v4i32_with_unfolded_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_sext_v4i32_with_unfolded_offset:
+; CHECK:         .functype load_sext_v4i32_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i32x4.load16x4_s 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <4 x i16>* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to <4 x i16>*
@@ -1229,14 +1339,15 @@ define <4 x i32> @load_sext_v4i32_with_unfolded_offset(<4 x i16>* %p) {
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_zext_v4i32_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v4i32_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i32x4.load16x4_u $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_zext_v4i32_with_unfolded_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_zext_v4i32_with_unfolded_offset:
+; CHECK:         .functype load_zext_v4i32_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i32x4.load16x4_u 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <4 x i16>* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to <4 x i16>*
@@ -1245,14 +1356,15 @@ define <4 x i32> @load_zext_v4i32_with_unfolded_offset(<4 x i16>* %p) {
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_ext_v4i32_with_unfolded_offset:
-; NO-SIMD128-NOT: load16x4
-; SIMD128-NEXT: .functype load_ext_v4i32_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i32x4.load16x4_u $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i16> @load_ext_v4i32_with_unfolded_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_ext_v4i32_with_unfolded_offset:
+; CHECK:         .functype load_ext_v4i32_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i32x4.load16x4_u 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <4 x i16>* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to <4 x i16>*
@@ -1260,27 +1372,29 @@ define <4 x i16> @load_ext_v4i32_with_unfolded_offset(<4 x i16>* %p) {
   ret <4 x i16> %v
 }
 
-; CHECK-LABEL: load_v4i32_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v4i32_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_unfolded_gep_offset(<4 x i32>* %p) {
+; CHECK-LABEL: load_v4i32_with_unfolded_gep_offset:
+; CHECK:         .functype load_v4i32_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <4 x i32>, <4 x i32>* %p, i32 1
   %v = load <4 x i32>, <4 x i32>* %s
   ret <4 x i32> %v
 }
 
-; CHECK-LABEL: load_splat_v4i32_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v4i32_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 4{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v32x4.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_splat_v4i32_with_unfolded_gep_offset(i32* %p) {
+; CHECK-LABEL: load_splat_v4i32_with_unfolded_gep_offset:
+; CHECK:         .functype load_splat_v4i32_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 4
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v32x4.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr i32, i32* %p, i32 1
   %e = load i32, i32* %s
   %v1 = insertelement <4 x i32> undef, i32 %e, i32 0
@@ -1288,66 +1402,69 @@ define <4 x i32> @load_splat_v4i32_with_unfolded_gep_offset(i32* %p) {
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_sext_v4i32_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v4i32_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i32x4.load16x4_s $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_sext_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_sext_v4i32_with_unfolded_gep_offset:
+; CHECK:         .functype load_sext_v4i32_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i32x4.load16x4_s 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <4 x i16>, <4 x i16>* %p, i32 1
   %v = load <4 x i16>, <4 x i16>* %s
   %v2 = sext <4 x i16> %v to <4 x i32>
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_zext_v4i32_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v4i32_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i32x4.load16x4_u $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_zext_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_zext_v4i32_with_unfolded_gep_offset:
+; CHECK:         .functype load_zext_v4i32_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i32x4.load16x4_u 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <4 x i16>, <4 x i16>* %p, i32 1
   %v = load <4 x i16>, <4 x i16>* %s
   %v2 = zext <4 x i16> %v to <4 x i32>
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_ext_v4i32_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: load16x4
-; SIMD128-NEXT: .functype load_ext_v4i32_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i32x4.load16x4_u $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i16> @load_ext_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_ext_v4i32_with_unfolded_gep_offset:
+; CHECK:         .functype load_ext_v4i32_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i32x4.load16x4_u 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <4 x i16>, <4 x i16>* %p, i32 1
   %v = load <4 x i16>, <4 x i16>* %s
   ret <4 x i16> %v
 }
 
-; CHECK-LABEL: load_v4i32_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v4i32_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_from_numeric_address() {
+; CHECK-LABEL: load_v4i32_from_numeric_address:
+; CHECK:         .functype load_v4i32_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <4 x i32>*
   %v = load <4 x i32>, <4 x i32>* %s
   ret <4 x i32> %v
 }
 
-; CHECK-LABEL: load_splat_v4i32_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v4i32_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v32x4.load_splat $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_splat_v4i32_from_numeric_address() {
+; CHECK-LABEL: load_splat_v4i32_from_numeric_address:
+; CHECK:         .functype load_splat_v4i32_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v32x4.load_splat 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to i32*
   %e = load i32, i32* %s
   %v1 = insertelement <4 x i32> undef, i32 %e, i32 0
@@ -1355,120 +1472,126 @@ define <4 x i32> @load_splat_v4i32_from_numeric_address() {
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_sext_v4i32_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v4i32_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i32x4.load16x4_s $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_sext_v4i32_from_numeric_address() {
+; CHECK-LABEL: load_sext_v4i32_from_numeric_address:
+; CHECK:         .functype load_sext_v4i32_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i32x4.load16x4_s 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <4 x i16>*
   %v = load <4 x i16>, <4 x i16>* %s
   %v2 = sext <4 x i16> %v to <4 x i32>
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_zext_v4i32_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v4i32_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i32x4.load16x4_u $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_zext_v4i32_from_numeric_address() {
+; CHECK-LABEL: load_zext_v4i32_from_numeric_address:
+; CHECK:         .functype load_zext_v4i32_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i32x4.load16x4_u 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <4 x i16>*
   %v = load <4 x i16>, <4 x i16>* %s
   %v2 = zext <4 x i16> %v to <4 x i32>
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_ext_v4i32_from_numeric_address:
-; NO-SIMD128-NOT: load16x4
-; SIMD128-NEXT: .functype load_ext_v4i32_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i32x4.load16x4_u $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i16> @load_ext_v4i32_from_numeric_address() {
+; CHECK-LABEL: load_ext_v4i32_from_numeric_address:
+; CHECK:         .functype load_ext_v4i32_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i32x4.load16x4_u 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <4 x i16>*
   %v = load <4 x i16>, <4 x i16>* %s
   ret <4 x i16> %v
 }
 
-; CHECK-LABEL: load_v4i32_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v4i32_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v4i32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v4i32 = global <4 x i32> <i32 42, i32 42, i32 42, i32 42>
 define <4 x i32> @load_v4i32_from_global_address() {
+; CHECK-LABEL: load_v4i32_from_global_address:
+; CHECK:         .functype load_v4i32_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load gv_v4i32
+; CHECK-NEXT:    # fallthrough-return
   %v = load <4 x i32>, <4 x i32>* @gv_v4i32
   ret <4 x i32> %v
 }
 
-; CHECK-LABEL: load_splat_v4i32_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v4i32_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v32x4.load_splat $push[[R:[0-9]+]]=, gv_i32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_i32 = global i32 42
 define <4 x i32> @load_splat_v4i32_from_global_address() {
+; CHECK-LABEL: load_splat_v4i32_from_global_address:
+; CHECK:         .functype load_splat_v4i32_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v32x4.load_splat gv_i32
+; CHECK-NEXT:    # fallthrough-return
   %e = load i32, i32* @gv_i32
   %v1 = insertelement <4 x i32> undef, i32 %e, i32 0
   %v2 = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> zeroinitializer
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_sext_v4i32_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v4i32_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i32x4.load16x4_s $push[[R:[0-9]+]]=, gv_v4i16($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v4i16 = global <4 x i16> <i16 42, i16 42, i16 42, i16 42>
 define <4 x i32> @load_sext_v4i32_from_global_address() {
+; CHECK-LABEL: load_sext_v4i32_from_global_address:
+; CHECK:         .functype load_sext_v4i32_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i32x4.load16x4_s gv_v4i16
+; CHECK-NEXT:    # fallthrough-return
   %v = load <4 x i16>, <4 x i16>* @gv_v4i16
   %v2 = sext <4 x i16> %v to <4 x i32>
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_zext_v4i32_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v4i32_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i32x4.load16x4_u $push[[R:[0-9]+]]=, gv_v4i16($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_zext_v4i32_from_global_address() {
+; CHECK-LABEL: load_zext_v4i32_from_global_address:
+; CHECK:         .functype load_zext_v4i32_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i32x4.load16x4_u gv_v4i16
+; CHECK-NEXT:    # fallthrough-return
   %v = load <4 x i16>, <4 x i16>* @gv_v4i16
   %v2 = zext <4 x i16> %v to <4 x i32>
   ret <4 x i32> %v2
 }
 
-; CHECK-LABEL: load_ext_v4i32_from_global_address:
-; NO-SIMD128-NOT: load16x4
-; SIMD128-NEXT: .functype load_ext_v4i32_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i32x4.load16x4_u $push[[R:[0-9]+]]=, gv_v4i16($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i16> @load_ext_v4i32_from_global_address() {
+; CHECK-LABEL: load_ext_v4i32_from_global_address:
+; CHECK:         .functype load_ext_v4i32_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i32x4.load16x4_u gv_v4i16
+; CHECK-NEXT:    # fallthrough-return
   %v = load <4 x i16>, <4 x i16>* @gv_v4i16
   ret <4 x i16> %v
 }
 
-; CHECK-LABEL: store_v4i32:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v4i32 (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v4i32(<4 x i32> %v, <4 x i32>* %p) {
+; CHECK-LABEL: store_v4i32:
+; CHECK:         .functype store_v4i32 (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   store <4 x i32> %v , <4 x i32>* %p
   ret void
 }
 
-; CHECK-LABEL: store_v4i32_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v4i32_with_folded_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v4i32_with_folded_offset(<4 x i32> %v, <4 x i32>* %p) {
+; CHECK-LABEL: store_v4i32_with_folded_offset:
+; CHECK:         .functype store_v4i32_with_folded_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <4 x i32>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <4 x i32>*
@@ -1476,69 +1599,85 @@ define void @store_v4i32_with_folded_offset(<4 x i32> %v, <4 x i32>* %p) {
   ret void
 }
 
-; CHECK-LABEL: store_v4i32_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v4i32_with_folded_gep_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v4i32_with_folded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
+; CHECK-LABEL: store_v4i32_with_folded_gep_offset:
+; CHECK:         .functype store_v4i32_with_folded_gep_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 1
   store <4 x i32> %v , <4 x i32>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v4i32_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v4i32_with_unfolded_gep_negative_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4i32_with_unfolded_gep_negative_offset(<4 x i32> %v, <4 x i32>* %p) {
+; CHECK-LABEL: store_v4i32_with_unfolded_gep_negative_offset:
+; CHECK:         .functype store_v4i32_with_unfolded_gep_negative_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 -1
   store <4 x i32> %v , <4 x i32>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v4i32_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v4i32_with_unfolded_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4i32_with_unfolded_offset(<4 x i32> %v, <4 x i32>* %p) {
+; CHECK-LABEL: store_v4i32_with_unfolded_offset:
+; CHECK:         .functype store_v4i32_with_unfolded_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 -1
   store <4 x i32> %v , <4 x i32>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v4i32_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v4i32_with_unfolded_gep_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4i32_with_unfolded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
+; CHECK-LABEL: store_v4i32_with_unfolded_gep_offset:
+; CHECK:         .functype store_v4i32_with_unfolded_gep_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <4 x i32>, <4 x i32>* %p, i32 1
   store <4 x i32> %v , <4 x i32>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v4i32_to_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v4i32_to_numeric_address (v128) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v4i32_to_numeric_address(<4 x i32> %v) {
+; CHECK-LABEL: store_v4i32_to_numeric_address:
+; CHECK:         .functype store_v4i32_to_numeric_address (v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <4 x i32>*
   store <4 x i32> %v , <4 x i32>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v4i32_to_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v4i32_to_global_address (v128) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v4i32($pop[[R]]), $0{{$}}
 define void @store_v4i32_to_global_address(<4 x i32> %v) {
+; CHECK-LABEL: store_v4i32_to_global_address:
+; CHECK:         .functype store_v4i32_to_global_address (v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store gv_v4i32
+; CHECK-NEXT:    # fallthrough-return
   store <4 x i32> %v , <4 x i32>* @gv_v4i32
   ret void
 }
@@ -1546,66 +1685,72 @@ define void @store_v4i32_to_global_address(<4 x i32> %v) {
 ; ==============================================================================
 ; 2 x i64
 ; ==============================================================================
-; CHECK-LABEL: load_v2i64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v2i64 (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64(<2 x i64>* %p) {
+; CHECK-LABEL: load_v2i64:
+; CHECK:         .functype load_v2i64 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %v = load <2 x i64>, <2 x i64>* %p
   ret <2 x i64> %v
 }
 
-; CHECK-LABEL: load_splat_v2i64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v2i64 (i32) -> (v128){{$}}
-; SIMD128-NEXT: v64x2.load_splat $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_splat_v2i64(i64* %p) {
+; CHECK-LABEL: load_splat_v2i64:
+; CHECK:         .functype load_splat_v2i64 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v64x2.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %e = load i64, i64* %p
   %v1 = insertelement <2 x i64> undef, i64 %e, i32 0
   %v2 = shufflevector <2 x i64> %v1, <2 x i64> undef, <2 x i32> zeroinitializer
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_sext_v2i64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v2i64 (i32) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.load32x2_s $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_sext_v2i64(<2 x i32>* %p) {
+; CHECK-LABEL: load_sext_v2i64:
+; CHECK:         .functype load_sext_v2i64 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i64x2.load32x2_s 0
+; CHECK-NEXT:    # fallthrough-return
   %v = load <2 x i32>, <2 x i32>* %p
   %v2 = sext <2 x i32> %v to <2 x i64>
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_zext_v2i64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v2i64 (i32) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.load32x2_u $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_zext_v2i64(<2 x i32>* %p) {
+; CHECK-LABEL: load_zext_v2i64:
+; CHECK:         .functype load_zext_v2i64 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i64x2.load32x2_u 0
+; CHECK-NEXT:    # fallthrough-return
   %v = load <2 x i32>, <2 x i32>* %p
   %v2 = zext <2 x i32> %v to <2 x i64>
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_ext_v2i64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_ext_v2i64 (i32) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.load32x2_u $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i32> @load_ext_v2i64(<2 x i32>* %p) {
+; CHECK-LABEL: load_ext_v2i64:
+; CHECK:         .functype load_ext_v2i64 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i64x2.load32x2_u 0
+; CHECK-NEXT:    # fallthrough-return
   %v = load <2 x i32>, <2 x i32>* %p
   ret <2 x i32> %v
 }
 
-; CHECK-LABEL: load_v2i64_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v2i64_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_folded_offset(<2 x i64>* %p) {
+; CHECK-LABEL: load_v2i64_with_folded_offset:
+; CHECK:         .functype load_v2i64_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <2 x i64>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <2 x i64>*
@@ -1613,12 +1758,13 @@ define <2 x i64> @load_v2i64_with_folded_offset(<2 x i64>* %p) {
   ret <2 x i64> %v
 }
 
-; CHECK-LABEL: load_splat_v2i64_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v2i64_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v64x2.load_splat $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_splat_v2i64_with_folded_offset(i64* %p) {
+; CHECK-LABEL: load_splat_v2i64_with_folded_offset:
+; CHECK:         .functype load_splat_v2i64_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v64x2.load_splat 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint i64* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to i64*
@@ -1628,12 +1774,13 @@ define <2 x i64> @load_splat_v2i64_with_folded_offset(i64* %p) {
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_sext_v2i64_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v2i64_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.load32x2_s $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_sext_v2i64_with_folded_offset(<2 x i32>* %p) {
+; CHECK-LABEL: load_sext_v2i64_with_folded_offset:
+; CHECK:         .functype load_sext_v2i64_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i64x2.load32x2_s 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <2 x i32>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <2 x i32>*
@@ -1642,12 +1789,13 @@ define <2 x i64> @load_sext_v2i64_with_folded_offset(<2 x i32>* %p) {
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_zext_v2i64_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v2i64_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.load32x2_u $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_zext_v2i64_with_folded_offset(<2 x i32>* %p) {
+; CHECK-LABEL: load_zext_v2i64_with_folded_offset:
+; CHECK:         .functype load_zext_v2i64_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i64x2.load32x2_u 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <2 x i32>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <2 x i32>*
@@ -1656,12 +1804,13 @@ define <2 x i64> @load_zext_v2i64_with_folded_offset(<2 x i32>* %p) {
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_ext_v2i64_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_ext_v2i64_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.load32x2_u $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i32> @load_ext_v2i64_with_folded_offset(<2 x i32>* %p) {
+; CHECK-LABEL: load_ext_v2i64_with_folded_offset:
+; CHECK:         .functype load_ext_v2i64_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i64x2.load32x2_u 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <2 x i32>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <2 x i32>*
@@ -1669,23 +1818,25 @@ define <2 x i32> @load_ext_v2i64_with_folded_offset(<2 x i32>* %p) {
   ret <2 x i32> %v
 }
 
-; CHECK-LABEL: load_v2i64_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v2i64_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_folded_gep_offset(<2 x i64>* %p) {
+; CHECK-LABEL: load_v2i64_with_folded_gep_offset:
+; CHECK:         .functype load_v2i64_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 16
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i32 1
   %v = load <2 x i64>, <2 x i64>* %s
   ret <2 x i64> %v
 }
 
-; CHECK-LABEL: load_splat_v2i64_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v2i64_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v64x2.load_splat $push[[R:[0-9]+]]=, 8($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_splat_v2i64_with_folded_gep_offset(i64* %p) {
+; CHECK-LABEL: load_splat_v2i64_with_folded_gep_offset:
+; CHECK:         .functype load_splat_v2i64_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v64x2.load_splat 8
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds i64, i64* %p, i32 1
   %e = load i64, i64* %s
   %v1 = insertelement <2 x i64> undef, i64 %e, i32 0
@@ -1693,62 +1844,67 @@ define <2 x i64> @load_splat_v2i64_with_folded_gep_offset(i64* %p) {
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_sext_v2i64_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v2i64_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.load32x2_s $push[[R:[0-9]+]]=, 8($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_sext_v2i64_with_folded_gep_offset(<2 x i32>* %p) {
+; CHECK-LABEL: load_sext_v2i64_with_folded_gep_offset:
+; CHECK:         .functype load_sext_v2i64_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i64x2.load32x2_s 8
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <2 x i32>, <2 x i32>* %p, i32 1
   %v = load <2 x i32>, <2 x i32>* %s
   %v2 = sext <2 x i32> %v to <2 x i64>
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_zext_v2i64_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v2i64_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.load32x2_u $push[[R:[0-9]+]]=, 8($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_zext_v2i64_with_folded_gep_offset(<2 x i32>* %p) {
+; CHECK-LABEL: load_zext_v2i64_with_folded_gep_offset:
+; CHECK:         .functype load_zext_v2i64_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i64x2.load32x2_u 8
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <2 x i32>, <2 x i32>* %p, i32 1
   %v = load <2 x i32>, <2 x i32>* %s
   %v2 = zext <2 x i32> %v to <2 x i64>
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_ext_v2i64_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_ext_v2i64_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.load32x2_u $push[[R:[0-9]+]]=, 8($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i32> @load_ext_v2i64_with_folded_gep_offset(<2 x i32>* %p) {
+; CHECK-LABEL: load_ext_v2i64_with_folded_gep_offset:
+; CHECK:         .functype load_ext_v2i64_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i64x2.load32x2_u 8
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <2 x i32>, <2 x i32>* %p, i32 1
   %v = load <2 x i32>, <2 x i32>* %s
   ret <2 x i32> %v
 }
 
-; CHECK-LABEL: load_v2i64_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v2i64_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_unfolded_gep_negative_offset(<2 x i64>* %p) {
+; CHECK-LABEL: load_v2i64_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_v2i64_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i32 -1
   %v = load <2 x i64>, <2 x i64>* %s
   ret <2 x i64> %v
 }
 
-; CHECK-LABEL: load_splat_v2i64_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v2i64_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v64x2.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_splat_v2i64_with_unfolded_gep_negative_offset(i64* %p) {
+; CHECK-LABEL: load_splat_v2i64_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_splat_v2i64_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v64x2.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds i64, i64* %p, i32 -1
   %e = load i64, i64* %s
   %v1 = insertelement <2 x i64> undef, i64 %e, i32 0
@@ -1756,55 +1912,59 @@ define <2 x i64> @load_splat_v2i64_with_unfolded_gep_negative_offset(i64* %p) {
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_sext_v2i64_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v2i64_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i64x2.load32x2_s $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_sext_v2i64_with_unfolded_gep_negative_offset(<2 x i32>* %p) {
+; CHECK-LABEL: load_sext_v2i64_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_sext_v2i64_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i64x2.load32x2_s 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <2 x i32>, <2 x i32>* %p, i32 -1
   %v = load <2 x i32>, <2 x i32>* %s
   %v2 = sext <2 x i32> %v to <2 x i64>
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_zext_v2i64_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v2i64_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i64x2.load32x2_u $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_zext_v2i64_with_unfolded_gep_negative_offset(<2 x i32>* %p) {
+; CHECK-LABEL: load_zext_v2i64_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_zext_v2i64_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i64x2.load32x2_u 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <2 x i32>, <2 x i32>* %p, i32 -1
   %v = load <2 x i32>, <2 x i32>* %s
   %v2 = zext <2 x i32> %v to <2 x i64>
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_ext_v2i64_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_ext_v2i64_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i64x2.load32x2_u $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i32> @load_ext_v2i64_with_unfolded_gep_negative_offset(<2 x i32>* %p) {
+; CHECK-LABEL: load_ext_v2i64_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_ext_v2i64_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i64x2.load32x2_u 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <2 x i32>, <2 x i32>* %p, i32 -1
   %v = load <2 x i32>, <2 x i32>* %s
   ret <2 x i32> %v
 }
 
-; CHECK-LABEL: load_v2i64_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v2i64_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_unfolded_offset(<2 x i64>* %p) {
+; CHECK-LABEL: load_v2i64_with_unfolded_offset:
+; CHECK:         .functype load_v2i64_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <2 x i64>* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to <2 x i64>*
@@ -1812,14 +1972,15 @@ define <2 x i64> @load_v2i64_with_unfolded_offset(<2 x i64>* %p) {
   ret <2 x i64> %v
 }
 
-; CHECK-LABEL: load_splat_v2i64_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v2i64_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v64x2.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_splat_v2i64_with_unfolded_offset(i64* %p) {
+; CHECK-LABEL: load_splat_v2i64_with_unfolded_offset:
+; CHECK:         .functype load_splat_v2i64_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v64x2.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint i64* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to i64*
@@ -1829,14 +1990,15 @@ define <2 x i64> @load_splat_v2i64_with_unfolded_offset(i64* %p) {
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_sext_v2i64_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v2i64_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i64x2.load32x2_s $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_sext_v2i64_with_unfolded_offset(<2 x i32>* %p) {
+; CHECK-LABEL: load_sext_v2i64_with_unfolded_offset:
+; CHECK:         .functype load_sext_v2i64_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i64x2.load32x2_s 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <2 x i32>* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to <2 x i32>*
@@ -1845,14 +2007,15 @@ define <2 x i64> @load_sext_v2i64_with_unfolded_offset(<2 x i32>* %p) {
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_zext_v2i64_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v2i64_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i64x2.load32x2_u $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_zext_v2i64_with_unfolded_offset(<2 x i32>* %p) {
+; CHECK-LABEL: load_zext_v2i64_with_unfolded_offset:
+; CHECK:         .functype load_zext_v2i64_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i64x2.load32x2_u 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <2 x i32>* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to <2 x i32>*
@@ -1861,14 +2024,15 @@ define <2 x i64> @load_zext_v2i64_with_unfolded_offset(<2 x i32>* %p) {
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_ext_v2i64_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_ext_v2i64_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i64x2.load32x2_u $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i32> @load_ext_v2i64_with_unfolded_offset(<2 x i32>* %p) {
+; CHECK-LABEL: load_ext_v2i64_with_unfolded_offset:
+; CHECK:         .functype load_ext_v2i64_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i64x2.load32x2_u 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <2 x i32>* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to <2 x i32>*
@@ -1876,27 +2040,29 @@ define <2 x i32> @load_ext_v2i64_with_unfolded_offset(<2 x i32>* %p) {
   ret <2 x i32> %v
 }
 
-; CHECK-LABEL: load_v2i64_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v2i64_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_unfolded_gep_offset(<2 x i64>* %p) {
+; CHECK-LABEL: load_v2i64_with_unfolded_gep_offset:
+; CHECK:         .functype load_v2i64_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <2 x i64>, <2 x i64>* %p, i32 1
   %v = load <2 x i64>, <2 x i64>* %s
   ret <2 x i64> %v
 }
 
-; CHECK-LABEL: load_splat_v2i64_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v2i64_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v64x2.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_splat_v2i64_with_unfolded_gep_offset(i64* %p) {
+; CHECK-LABEL: load_splat_v2i64_with_unfolded_gep_offset:
+; CHECK:         .functype load_splat_v2i64_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v64x2.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr i64, i64* %p, i32 1
   %e = load i64, i64* %s
   %v1 = insertelement <2 x i64> undef, i64 %e, i32 0
@@ -1904,66 +2070,69 @@ define <2 x i64> @load_splat_v2i64_with_unfolded_gep_offset(i64* %p) {
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_sext_v2i64_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v2i64_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i64x2.load32x2_s $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_sext_v2i64_with_unfolded_gep_offset(<2 x i32>* %p) {
+; CHECK-LABEL: load_sext_v2i64_with_unfolded_gep_offset:
+; CHECK:         .functype load_sext_v2i64_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i64x2.load32x2_s 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <2 x i32>, <2 x i32>* %p, i32 1
   %v = load <2 x i32>, <2 x i32>* %s
   %v2 = sext <2 x i32> %v to <2 x i64>
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_zext_v2i64_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v2i64_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i64x2.load32x2_u $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_zext_v2i64_with_unfolded_gep_offset(<2 x i32>* %p) {
+; CHECK-LABEL: load_zext_v2i64_with_unfolded_gep_offset:
+; CHECK:         .functype load_zext_v2i64_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i64x2.load32x2_u 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <2 x i32>, <2 x i32>* %p, i32 1
   %v = load <2 x i32>, <2 x i32>* %s
   %v2 = zext <2 x i32> %v to <2 x i64>
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_ext_v2i64_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_ext_v2i64_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i64x2.load32x2_u $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i32> @load_ext_v2i64_with_unfolded_gep_offset(<2 x i32>* %p) {
+; CHECK-LABEL: load_ext_v2i64_with_unfolded_gep_offset:
+; CHECK:         .functype load_ext_v2i64_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i64x2.load32x2_u 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <2 x i32>, <2 x i32>* %p, i32 1
   %v = load <2 x i32>, <2 x i32>* %s
   ret <2 x i32> %v
 }
 
-; CHECK-LABEL: load_v2i64_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v2i64_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_from_numeric_address() {
+; CHECK-LABEL: load_v2i64_from_numeric_address:
+; CHECK:         .functype load_v2i64_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <2 x i64>*
   %v = load <2 x i64>, <2 x i64>* %s
   ret <2 x i64> %v
 }
 
-; CHECK-LABEL: load_splat_v2i64_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v2i64_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v64x2.load_splat $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_splat_v2i64_from_numeric_address() {
+; CHECK-LABEL: load_splat_v2i64_from_numeric_address:
+; CHECK:         .functype load_splat_v2i64_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v64x2.load_splat 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to i64*
   %e = load i64, i64* %s
   %v1 = insertelement <2 x i64> undef, i64 %e, i32 0
@@ -1971,120 +2140,126 @@ define <2 x i64> @load_splat_v2i64_from_numeric_address() {
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_sext_v2i64_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v2i64_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i64x2.load32x2_s $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_sext_v2i64_from_numeric_address() {
+; CHECK-LABEL: load_sext_v2i64_from_numeric_address:
+; CHECK:         .functype load_sext_v2i64_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i64x2.load32x2_s 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <2 x i32>*
   %v = load <2 x i32>, <2 x i32>* %s
   %v2 = sext <2 x i32> %v to <2 x i64>
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_zext_v2i64_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v2i64_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i64x2.load32x2_u $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_zext_v2i64_from_numeric_address() {
+; CHECK-LABEL: load_zext_v2i64_from_numeric_address:
+; CHECK:         .functype load_zext_v2i64_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i64x2.load32x2_u 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <2 x i32>*
   %v = load <2 x i32>, <2 x i32>* %s
   %v2 = zext <2 x i32> %v to <2 x i64>
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_ext_v2i64_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_ext_v2i64_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i64x2.load32x2_u $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i32> @load_ext_v2i64_from_numeric_address() {
+; CHECK-LABEL: load_ext_v2i64_from_numeric_address:
+; CHECK:         .functype load_ext_v2i64_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i64x2.load32x2_u 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <2 x i32>*
   %v = load <2 x i32>, <2 x i32>* %s
   ret <2 x i32> %v
 }
 
-; CHECK-LABEL: load_v2i64_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v2i64_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v2i64($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v2i64 = global <2 x i64> <i64 42, i64 42>
 define <2 x i64> @load_v2i64_from_global_address() {
+; CHECK-LABEL: load_v2i64_from_global_address:
+; CHECK:         .functype load_v2i64_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load gv_v2i64
+; CHECK-NEXT:    # fallthrough-return
   %v = load <2 x i64>, <2 x i64>* @gv_v2i64
   ret <2 x i64> %v
 }
 
-; CHECK-LABEL: load_splat_v2i64_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v2i64_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v64x2.load_splat $push[[R:[0-9]+]]=, gv_i64($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_i64 = global i64 42
 define <2 x i64> @load_splat_v2i64_from_global_address() {
+; CHECK-LABEL: load_splat_v2i64_from_global_address:
+; CHECK:         .functype load_splat_v2i64_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v64x2.load_splat gv_i64
+; CHECK-NEXT:    # fallthrough-return
   %e = load i64, i64* @gv_i64
   %v1 = insertelement <2 x i64> undef, i64 %e, i32 0
   %v2 = shufflevector <2 x i64> %v1, <2 x i64> undef, <2 x i32> zeroinitializer
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_sext_v2i64_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_sext_v2i64_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i64x2.load32x2_s $push[[R:[0-9]+]]=, gv_v2i32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v2i32 = global <2 x i32> <i32 42, i32 42>
 define <2 x i64> @load_sext_v2i64_from_global_address() {
+; CHECK-LABEL: load_sext_v2i64_from_global_address:
+; CHECK:         .functype load_sext_v2i64_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i64x2.load32x2_s gv_v2i32
+; CHECK-NEXT:    # fallthrough-return
   %v = load <2 x i32>, <2 x i32>* @gv_v2i32
   %v2 = sext <2 x i32> %v to <2 x i64>
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_zext_v2i64_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_zext_v2i64_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i64x2.load32x2_u $push[[R:[0-9]+]]=, gv_v2i32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_zext_v2i64_from_global_address() {
+; CHECK-LABEL: load_zext_v2i64_from_global_address:
+; CHECK:         .functype load_zext_v2i64_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i64x2.load32x2_u gv_v2i32
+; CHECK-NEXT:    # fallthrough-return
   %v = load <2 x i32>, <2 x i32>* @gv_v2i32
   %v2 = zext <2 x i32> %v to <2 x i64>
   ret <2 x i64> %v2
 }
 
-; CHECK-LABEL: load_ext_v2i64_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_ext_v2i64_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: i64x2.load32x2_u $push[[R:[0-9]+]]=, gv_v2i32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i32> @load_ext_v2i64_from_global_address() {
+; CHECK-LABEL: load_ext_v2i64_from_global_address:
+; CHECK:         .functype load_ext_v2i64_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i64x2.load32x2_u gv_v2i32
+; CHECK-NEXT:    # fallthrough-return
   %v = load <2 x i32>, <2 x i32>* @gv_v2i32
   ret <2 x i32> %v
 }
 
-; CHECK-LABEL: store_v2i64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v2i64 (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v2i64(<2 x i64> %v, <2 x i64>* %p) {
+; CHECK-LABEL: store_v2i64:
+; CHECK:         .functype store_v2i64 (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   store <2 x i64> %v , <2 x i64>* %p
   ret void
 }
 
-; CHECK-LABEL: store_v2i64_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v2i64_with_folded_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v2i64_with_folded_offset(<2 x i64> %v, <2 x i64>* %p) {
+; CHECK-LABEL: store_v2i64_with_folded_offset:
+; CHECK:         .functype store_v2i64_with_folded_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <2 x i64>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <2 x i64>*
@@ -2092,69 +2267,85 @@ define void @store_v2i64_with_folded_offset(<2 x i64> %v, <2 x i64>* %p) {
   ret void
 }
 
-; CHECK-LABEL: store_v2i64_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v2i64_with_folded_gep_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v2i64_with_folded_gep_offset(<2 x i64> %v, <2 x i64>* %p) {
+; CHECK-LABEL: store_v2i64_with_folded_gep_offset:
+; CHECK:         .functype store_v2i64_with_folded_gep_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i32 1
   store <2 x i64> %v , <2 x i64>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v2i64_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v2i64_with_unfolded_gep_negative_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2i64_with_unfolded_gep_negative_offset(<2 x i64> %v, <2 x i64>* %p) {
+; CHECK-LABEL: store_v2i64_with_unfolded_gep_negative_offset:
+; CHECK:         .functype store_v2i64_with_unfolded_gep_negative_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i32 -1
   store <2 x i64> %v , <2 x i64>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v2i64_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v2i64_with_unfolded_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2i64_with_unfolded_offset(<2 x i64> %v, <2 x i64>* %p) {
+; CHECK-LABEL: store_v2i64_with_unfolded_offset:
+; CHECK:         .functype store_v2i64_with_unfolded_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i32 -1
   store <2 x i64> %v , <2 x i64>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v2i64_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v2i64_with_unfolded_gep_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2i64_with_unfolded_gep_offset(<2 x i64> %v, <2 x i64>* %p) {
+; CHECK-LABEL: store_v2i64_with_unfolded_gep_offset:
+; CHECK:         .functype store_v2i64_with_unfolded_gep_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <2 x i64>, <2 x i64>* %p, i32 1
   store <2 x i64> %v , <2 x i64>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v2i64_to_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v2i64_to_numeric_address (v128) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v2i64_to_numeric_address(<2 x i64> %v) {
+; CHECK-LABEL: store_v2i64_to_numeric_address:
+; CHECK:         .functype store_v2i64_to_numeric_address (v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <2 x i64>*
   store <2 x i64> %v , <2 x i64>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v2i64_to_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v2i64_to_global_address (v128) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v2i64($pop[[R]]), $0{{$}}
 define void @store_v2i64_to_global_address(<2 x i64> %v) {
+; CHECK-LABEL: store_v2i64_to_global_address:
+; CHECK:         .functype store_v2i64_to_global_address (v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store gv_v2i64
+; CHECK-NEXT:    # fallthrough-return
   store <2 x i64> %v , <2 x i64>* @gv_v2i64
   ret void
 }
@@ -2162,34 +2353,37 @@ define void @store_v2i64_to_global_address(<2 x i64> %v) {
 ; ==============================================================================
 ; 4 x float
 ; ==============================================================================
-; CHECK-LABEL: load_v4f32:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v4f32 (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32(<4 x float>* %p) {
+; CHECK-LABEL: load_v4f32:
+; CHECK:         .functype load_v4f32 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %v = load <4 x float>, <4 x float>* %p
   ret <4 x float> %v
 }
 
-; CHECK-LABEL: load_splat_v4f32:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v4f32 (i32) -> (v128){{$}}
-; SIMD128-NEXT: v32x4.load_splat $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_splat_v4f32(float* %p) {
+; CHECK-LABEL: load_splat_v4f32:
+; CHECK:         .functype load_splat_v4f32 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v32x4.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %e = load float, float* %p
   %v1 = insertelement <4 x float> undef, float %e, i32 0
   %v2 = shufflevector <4 x float> %v1, <4 x float> undef, <4 x i32> zeroinitializer
   ret <4 x float> %v2
 }
 
-; CHECK-LABEL: load_v4f32_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v4f32_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_folded_offset(<4 x float>* %p) {
+; CHECK-LABEL: load_v4f32_with_folded_offset:
+; CHECK:         .functype load_v4f32_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <4 x float>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <4 x float>*
@@ -2197,12 +2391,13 @@ define <4 x float> @load_v4f32_with_folded_offset(<4 x float>* %p) {
   ret <4 x float> %v
 }
 
-; CHECK-LABEL: load_splat_v4f32_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v4f32_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v32x4.load_splat $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_splat_v4f32_with_folded_offset(float* %p) {
+; CHECK-LABEL: load_splat_v4f32_with_folded_offset:
+; CHECK:         .functype load_splat_v4f32_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v32x4.load_splat 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint float* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to float*
@@ -2212,23 +2407,25 @@ define <4 x float> @load_splat_v4f32_with_folded_offset(float* %p) {
   ret <4 x float> %v2
 }
 
-; CHECK-LABEL: load_v4f32_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v4f32_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_folded_gep_offset(<4 x float>* %p) {
+; CHECK-LABEL: load_v4f32_with_folded_gep_offset:
+; CHECK:         .functype load_v4f32_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 16
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 1
   %v = load <4 x float>, <4 x float>* %s
   ret <4 x float> %v
 }
 
-; CHECK-LABEL: load_splat_v4f32_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v4f32_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v32x4.load_splat $push[[R:[0-9]+]]=, 4($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_splat_v4f32_with_folded_gep_offset(float* %p) {
+; CHECK-LABEL: load_splat_v4f32_with_folded_gep_offset:
+; CHECK:         .functype load_splat_v4f32_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v32x4.load_splat 4
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds float, float* %p, i32 1
   %e = load float, float* %s
   %v1 = insertelement <4 x float> undef, float %e, i32 0
@@ -2236,27 +2433,29 @@ define <4 x float> @load_splat_v4f32_with_folded_gep_offset(float* %p) {
   ret <4 x float> %v2
 }
 
-; CHECK-LABEL: load_v4f32_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v4f32_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_unfolded_gep_negative_offset(<4 x float>* %p) {
+; CHECK-LABEL: load_v4f32_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_v4f32_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 -1
   %v = load <4 x float>, <4 x float>* %s
   ret <4 x float> %v
 }
 
-; CHECK-LABEL: load_splat_v4f32_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v4f32_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -4{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v32x4.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_splat_v4f32_with_unfolded_gep_negative_offset(float* %p) {
+; CHECK-LABEL: load_splat_v4f32_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_splat_v4f32_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -4
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v32x4.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds float, float* %p, i32 -1
   %e = load float, float* %s
   %v1 = insertelement <4 x float> undef, float %e, i32 0
@@ -2264,14 +2463,15 @@ define <4 x float> @load_splat_v4f32_with_unfolded_gep_negative_offset(float* %p
   ret <4 x float> %v2
 }
 
-; CHECK-LABEL: load_v4f32_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v4f32_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_unfolded_offset(<4 x float>* %p) {
+; CHECK-LABEL: load_v4f32_with_unfolded_offset:
+; CHECK:         .functype load_v4f32_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <4 x float>* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to <4 x float>*
@@ -2279,14 +2479,15 @@ define <4 x float> @load_v4f32_with_unfolded_offset(<4 x float>* %p) {
   ret <4 x float> %v
 }
 
-; CHECK-LABEL: load_splat_v4f32_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v4f32_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v32x4.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_splat_v4f32_with_unfolded_offset(float* %p) {
+; CHECK-LABEL: load_splat_v4f32_with_unfolded_offset:
+; CHECK:         .functype load_splat_v4f32_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v32x4.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint float* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to float*
@@ -2296,27 +2497,29 @@ define <4 x float> @load_splat_v4f32_with_unfolded_offset(float* %p) {
   ret <4 x float> %v2
 }
 
-; CHECK-LABEL: load_v4f32_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v4f32_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_unfolded_gep_offset(<4 x float>* %p) {
+; CHECK-LABEL: load_v4f32_with_unfolded_gep_offset:
+; CHECK:         .functype load_v4f32_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <4 x float>, <4 x float>* %p, i32 1
   %v = load <4 x float>, <4 x float>* %s
   ret <4 x float> %v
 }
 
-; CHECK-LABEL: load_splat_v4f32_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v4f32_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 4{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v32x4.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_splat_v4f32_with_unfolded_gep_offset(float* %p) {
+; CHECK-LABEL: load_splat_v4f32_with_unfolded_gep_offset:
+; CHECK:         .functype load_splat_v4f32_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 4
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v32x4.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr float, float* %p, i32 1
   %e = load float, float* %s
   %v1 = insertelement <4 x float> undef, float %e, i32 0
@@ -2324,25 +2527,25 @@ define <4 x float> @load_splat_v4f32_with_unfolded_gep_offset(float* %p) {
   ret <4 x float> %v2
 }
 
-; CHECK-LABEL: load_v4f32_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v4f32_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_from_numeric_address() {
+; CHECK-LABEL: load_v4f32_from_numeric_address:
+; CHECK:         .functype load_v4f32_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <4 x float>*
   %v = load <4 x float>, <4 x float>* %s
   ret <4 x float> %v
 }
 
-; CHECK-LABEL: load_splat_v4f32_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v4f32_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v32x4.load_splat $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_splat_v4f32_from_numeric_address() {
+; CHECK-LABEL: load_splat_v4f32_from_numeric_address:
+; CHECK:         .functype load_splat_v4f32_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v32x4.load_splat 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to float*
   %e = load float, float* %s
   %v1 = insertelement <4 x float> undef, float %e, i32 0
@@ -2350,46 +2553,52 @@ define <4 x float> @load_splat_v4f32_from_numeric_address() {
   ret <4 x float> %v2
 }
 
-; CHECK-LABEL: load_v4f32_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v4f32_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v4f32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v4f32 = global <4 x float> <float 42., float 42., float 42., float 42.>
 define <4 x float> @load_v4f32_from_global_address() {
+; CHECK-LABEL: load_v4f32_from_global_address:
+; CHECK:         .functype load_v4f32_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load gv_v4f32
+; CHECK-NEXT:    # fallthrough-return
   %v = load <4 x float>, <4 x float>* @gv_v4f32
   ret <4 x float> %v
 }
 
-; CHECK-LABEL: load_splat_v4f32_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v4f32_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v32x4.load_splat $push[[R:[0-9]+]]=, gv_f32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_f32 = global float 42.
 define <4 x float> @load_splat_v4f32_from_global_address() {
+; CHECK-LABEL: load_splat_v4f32_from_global_address:
+; CHECK:         .functype load_splat_v4f32_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v32x4.load_splat gv_f32
+; CHECK-NEXT:    # fallthrough-return
   %e = load float, float* @gv_f32
   %v1 = insertelement <4 x float> undef, float %e, i32 0
   %v2 = shufflevector <4 x float> %v1, <4 x float> undef, <4 x i32> zeroinitializer
   ret <4 x float> %v2
 }
 
-; CHECK-LABEL: store_v4f32:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v4f32 (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v4f32(<4 x float> %v, <4 x float>* %p) {
+; CHECK-LABEL: store_v4f32:
+; CHECK:         .functype store_v4f32 (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   store <4 x float> %v , <4 x float>* %p
   ret void
 }
 
-; CHECK-LABEL: store_v4f32_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v4f32_with_folded_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v4f32_with_folded_offset(<4 x float> %v, <4 x float>* %p) {
+; CHECK-LABEL: store_v4f32_with_folded_offset:
+; CHECK:         .functype store_v4f32_with_folded_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <4 x float>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <4 x float>*
@@ -2397,69 +2606,85 @@ define void @store_v4f32_with_folded_offset(<4 x float> %v, <4 x float>* %p) {
   ret void
 }
 
-; CHECK-LABEL: store_v4f32_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v4f32_with_folded_gep_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v4f32_with_folded_gep_offset(<4 x float> %v, <4 x float>* %p) {
+; CHECK-LABEL: store_v4f32_with_folded_gep_offset:
+; CHECK:         .functype store_v4f32_with_folded_gep_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 1
   store <4 x float> %v , <4 x float>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v4f32_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v4f32_with_unfolded_gep_negative_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4f32_with_unfolded_gep_negative_offset(<4 x float> %v, <4 x float>* %p) {
+; CHECK-LABEL: store_v4f32_with_unfolded_gep_negative_offset:
+; CHECK:         .functype store_v4f32_with_unfolded_gep_negative_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 -1
   store <4 x float> %v , <4 x float>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v4f32_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v4f32_with_unfolded_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4f32_with_unfolded_offset(<4 x float> %v, <4 x float>* %p) {
+; CHECK-LABEL: store_v4f32_with_unfolded_offset:
+; CHECK:         .functype store_v4f32_with_unfolded_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 -1
   store <4 x float> %v , <4 x float>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v4f32_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v4f32_with_unfolded_gep_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4f32_with_unfolded_gep_offset(<4 x float> %v, <4 x float>* %p) {
+; CHECK-LABEL: store_v4f32_with_unfolded_gep_offset:
+; CHECK:         .functype store_v4f32_with_unfolded_gep_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <4 x float>, <4 x float>* %p, i32 1
   store <4 x float> %v , <4 x float>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v4f32_to_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v4f32_to_numeric_address (v128) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v4f32_to_numeric_address(<4 x float> %v) {
+; CHECK-LABEL: store_v4f32_to_numeric_address:
+; CHECK:         .functype store_v4f32_to_numeric_address (v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <4 x float>*
   store <4 x float> %v , <4 x float>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v4f32_to_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v4f32_to_global_address (v128) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v4f32($pop[[R]]), $0{{$}}
 define void @store_v4f32_to_global_address(<4 x float> %v) {
+; CHECK-LABEL: store_v4f32_to_global_address:
+; CHECK:         .functype store_v4f32_to_global_address (v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store gv_v4f32
+; CHECK-NEXT:    # fallthrough-return
   store <4 x float> %v , <4 x float>* @gv_v4f32
   ret void
 }
@@ -2467,34 +2692,37 @@ define void @store_v4f32_to_global_address(<4 x float> %v) {
 ; ==============================================================================
 ; 2 x double
 ; ==============================================================================
-; CHECK-LABEL: load_v2f64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v2f64 (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64(<2 x double>* %p) {
+; CHECK-LABEL: load_v2f64:
+; CHECK:         .functype load_v2f64 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %v = load <2 x double>, <2 x double>* %p
   ret <2 x double> %v
 }
 
-; CHECK-LABEL: load_splat_v2f64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v2f64 (i32) -> (v128){{$}}
-; SIMD128-NEXT: v64x2.load_splat $push[[R:[0-9]+]]=, 0($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_splat_v2f64(double* %p) {
+; CHECK-LABEL: load_splat_v2f64:
+; CHECK:         .functype load_splat_v2f64 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v64x2.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %e = load double, double* %p
   %v1 = insertelement <2 x double> undef, double %e, i32 0
   %v2 = shufflevector <2 x double> %v1, <2 x double> undef, <2 x i32> zeroinitializer
   ret <2 x double> %v2
 }
 
-; CHECK-LABEL: load_v2f64_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v2f64_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_folded_offset(<2 x double>* %p) {
+; CHECK-LABEL: load_v2f64_with_folded_offset:
+; CHECK:         .functype load_v2f64_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <2 x double>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <2 x double>*
@@ -2502,12 +2730,13 @@ define <2 x double> @load_v2f64_with_folded_offset(<2 x double>* %p) {
   ret <2 x double> %v
 }
 
-; CHECK-LABEL: load_splat_v2f64_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v2f64_with_folded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v64x2.load_splat $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_splat_v2f64_with_folded_offset(double* %p) {
+; CHECK-LABEL: load_splat_v2f64_with_folded_offset:
+; CHECK:         .functype load_splat_v2f64_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v64x2.load_splat 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint double* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to double*
@@ -2517,23 +2746,25 @@ define <2 x double> @load_splat_v2f64_with_folded_offset(double* %p) {
   ret <2 x double> %v2
 }
 
-; CHECK-LABEL: load_v2f64_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v2f64_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_folded_gep_offset(<2 x double>* %p) {
+; CHECK-LABEL: load_v2f64_with_folded_gep_offset:
+; CHECK:         .functype load_v2f64_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 16
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <2 x double>, <2 x double>* %p, i32 1
   %v = load <2 x double>, <2 x double>* %s
   ret <2 x double> %v
 }
 
-; CHECK-LABEL: load_splat_v2f64_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v2f64_with_folded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: v64x2.load_splat $push[[R:[0-9]+]]=, 8($0){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_splat_v2f64_with_folded_gep_offset(double* %p) {
+; CHECK-LABEL: load_splat_v2f64_with_folded_gep_offset:
+; CHECK:         .functype load_splat_v2f64_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v64x2.load_splat 8
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds double, double* %p, i32 1
   %e = load double, double* %s
   %v1 = insertelement <2 x double> undef, double %e, i32 0
@@ -2541,27 +2772,29 @@ define <2 x double> @load_splat_v2f64_with_folded_gep_offset(double* %p) {
   ret <2 x double> %v2
 }
 
-; CHECK-LABEL: load_v2f64_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v2f64_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_unfolded_gep_negative_offset(<2 x double>* %p) {
+; CHECK-LABEL: load_v2f64_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_v2f64_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <2 x double>, <2 x double>* %p, i32 -1
   %v = load <2 x double>, <2 x double>* %s
   ret <2 x double> %v
 }
 
-; CHECK-LABEL: load_splat_v2f64_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v2f64_with_unfolded_gep_negative_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v64x2.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_splat_v2f64_with_unfolded_gep_negative_offset(double* %p) {
+; CHECK-LABEL: load_splat_v2f64_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_splat_v2f64_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v64x2.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds double, double* %p, i32 -1
   %e = load double, double* %s
   %v1 = insertelement <2 x double> undef, double %e, i32 0
@@ -2569,14 +2802,15 @@ define <2 x double> @load_splat_v2f64_with_unfolded_gep_negative_offset(double*
   ret <2 x double> %v2
 }
 
-; CHECK-LABEL: load_v2f64_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v2f64_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_unfolded_offset(<2 x double>* %p) {
+; CHECK-LABEL: load_v2f64_with_unfolded_offset:
+; CHECK:         .functype load_v2f64_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <2 x double>* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to <2 x double>*
@@ -2584,14 +2818,15 @@ define <2 x double> @load_v2f64_with_unfolded_offset(<2 x double>* %p) {
   ret <2 x double> %v
 }
 
-; CHECK-LABEL: load_splat_v2f64_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v2f64_with_unfolded_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v64x2.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_splat_v2f64_with_unfolded_offset(double* %p) {
+; CHECK-LABEL: load_splat_v2f64_with_unfolded_offset:
+; CHECK:         .functype load_splat_v2f64_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v64x2.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint double* %p to i32
   %r = add nsw i32 %q, 16
   %s = inttoptr i32 %r to double*
@@ -2601,27 +2836,29 @@ define <2 x double> @load_splat_v2f64_with_unfolded_offset(double* %p) {
   ret <2 x double> %v2
 }
 
-; CHECK-LABEL: load_v2f64_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v2f64_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_unfolded_gep_offset(<2 x double>* %p) {
+; CHECK-LABEL: load_v2f64_with_unfolded_gep_offset:
+; CHECK:         .functype load_v2f64_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <2 x double>, <2 x double>* %p, i32 1
   %v = load <2 x double>, <2 x double>* %s
   ret <2 x double> %v
 }
 
-; CHECK-LABEL: load_splat_v2f64_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v2f64_with_unfolded_gep_offset (i32) -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 8{{$}}
-; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v64x2.load_splat $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_splat_v2f64_with_unfolded_gep_offset(double* %p) {
+; CHECK-LABEL: load_splat_v2f64_with_unfolded_gep_offset:
+; CHECK:         .functype load_splat_v2f64_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v64x2.load_splat 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr double, double* %p, i32 1
   %e = load double, double* %s
   %v1 = insertelement <2 x double> undef, double %e, i32 0
@@ -2629,25 +2866,25 @@ define <2 x double> @load_splat_v2f64_with_unfolded_gep_offset(double* %p) {
   ret <2 x double> %v2
 }
 
-; CHECK-LABEL: load_v2f64_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v2f64_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_from_numeric_address() {
+; CHECK-LABEL: load_v2f64_from_numeric_address:
+; CHECK:         .functype load_v2f64_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <2 x double>*
   %v = load <2 x double>, <2 x double>* %s
   ret <2 x double> %v
 }
 
-; CHECK-LABEL: load_splat_v2f64_from_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v2f64_from_numeric_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v64x2.load_splat $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_splat_v2f64_from_numeric_address() {
+; CHECK-LABEL: load_splat_v2f64_from_numeric_address:
+; CHECK:         .functype load_splat_v2f64_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v64x2.load_splat 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to double*
   %e = load double, double* %s
   %v1 = insertelement <2 x double> undef, double %e, i32 0
@@ -2655,46 +2892,52 @@ define <2 x double> @load_splat_v2f64_from_numeric_address() {
   ret <2 x double> %v2
 }
 
-; CHECK-LABEL: load_v2f64_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_v2f64_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v2f64($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v2f64 = global <2 x double> <double 42., double 42.>
 define <2 x double> @load_v2f64_from_global_address() {
+; CHECK-LABEL: load_v2f64_from_global_address:
+; CHECK:         .functype load_v2f64_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load gv_v2f64
+; CHECK-NEXT:    # fallthrough-return
   %v = load <2 x double>, <2 x double>* @gv_v2f64
   ret <2 x double> %v
 }
 
-; CHECK-LABEL: load_splat_v2f64_from_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype load_splat_v2f64_from_global_address () -> (v128){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v64x2.load_splat $push[[R:[0-9]+]]=, gv_f64($pop[[L0]]){{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_f64 = global double 42.
 define <2 x double> @load_splat_v2f64_from_global_address() {
+; CHECK-LABEL: load_splat_v2f64_from_global_address:
+; CHECK:         .functype load_splat_v2f64_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v64x2.load_splat gv_f64
+; CHECK-NEXT:    # fallthrough-return
   %e = load double, double* @gv_f64
   %v1 = insertelement <2 x double> undef, double %e, i32 0
   %v2 = shufflevector <2 x double> %v1, <2 x double> undef, <2 x i32> zeroinitializer
   ret <2 x double> %v2
 }
 
-; CHECK-LABEL: store_v2f64:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v2f64 (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v2f64(<2 x double> %v, <2 x double>* %p) {
+; CHECK-LABEL: store_v2f64:
+; CHECK:         .functype store_v2f64 (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   store <2 x double> %v , <2 x double>* %p
   ret void
 }
 
-; CHECK-LABEL: store_v2f64_with_folded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v2f64_with_folded_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v2f64_with_folded_offset(<2 x double> %v, <2 x double>* %p) {
+; CHECK-LABEL: store_v2f64_with_folded_offset:
+; CHECK:         .functype store_v2f64_with_folded_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
   %q = ptrtoint <2 x double>* %p to i32
   %r = add nuw i32 %q, 16
   %s = inttoptr i32 %r to <2 x double>*
@@ -2702,69 +2945,85 @@ define void @store_v2f64_with_folded_offset(<2 x double> %v, <2 x double>* %p) {
   ret void
 }
 
-; CHECK-LABEL: store_v2f64_with_folded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v2f64_with_folded_gep_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v2f64_with_folded_gep_offset(<2 x double> %v, <2 x double>* %p) {
+; CHECK-LABEL: store_v2f64_with_folded_gep_offset:
+; CHECK:         .functype store_v2f64_with_folded_gep_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <2 x double>, <2 x double>* %p, i32 1
   store <2 x double> %v , <2 x double>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v2f64_with_unfolded_gep_negative_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v2f64_with_unfolded_gep_negative_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2f64_with_unfolded_gep_negative_offset(<2 x double> %v, <2 x double>* %p) {
+; CHECK-LABEL: store_v2f64_with_unfolded_gep_negative_offset:
+; CHECK:         .functype store_v2f64_with_unfolded_gep_negative_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <2 x double>, <2 x double>* %p, i32 -1
   store <2 x double> %v , <2 x double>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v2f64_with_unfolded_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v2f64_with_unfolded_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2f64_with_unfolded_offset(<2 x double> %v, <2 x double>* %p) {
+; CHECK-LABEL: store_v2f64_with_unfolded_offset:
+; CHECK:         .functype store_v2f64_with_unfolded_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const -16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr inbounds <2 x double>, <2 x double>* %p, i32 -1
   store <2 x double> %v , <2 x double>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v2f64_with_unfolded_gep_offset:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v2f64_with_unfolded_gep_offset (v128, i32) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
-; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2f64_with_unfolded_gep_offset(<2 x double> %v, <2 x double>* %p) {
+; CHECK-LABEL: store_v2f64_with_unfolded_gep_offset:
+; CHECK:         .functype store_v2f64_with_unfolded_gep_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
   %s = getelementptr <2 x double>, <2 x double>* %p, i32 1
   store <2 x double> %v , <2 x double>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v2f64_to_numeric_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v2f64_to_numeric_address (v128) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v2f64_to_numeric_address(<2 x double> %v) {
+; CHECK-LABEL: store_v2f64_to_numeric_address:
+; CHECK:         .functype store_v2f64_to_numeric_address (v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store 32
+; CHECK-NEXT:    # fallthrough-return
   %s = inttoptr i32 32 to <2 x double>*
   store <2 x double> %v , <2 x double>* %s
   ret void
 }
 
-; CHECK-LABEL: store_v2f64_to_global_address:
-; NO-SIMD128-NOT: v128
-; SIMD128-NEXT: .functype store_v2f64_to_global_address (v128) -> (){{$}}
-; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v2f64($pop[[R]]), $0{{$}}
 define void @store_v2f64_to_global_address(<2 x double> %v) {
+; CHECK-LABEL: store_v2f64_to_global_address:
+; CHECK:         .functype store_v2f64_to_global_address (v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.store gv_v2f64
+; CHECK-NEXT:    # fallthrough-return
   store <2 x double> %v , <2 x double>* @gv_v2f64
   ret void
 }


        


More information about the llvm-commits mailing list