[llvm] [SelectionDAG] Use the nuw flag when expanding loads. (PR #119288)

Dan Gohman via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 9 23:09:13 PST 2024


https://github.com/sunfishcode updated https://github.com/llvm/llvm-project/pull/119288

>From ff2a597f8d6330fff9298f509948bd44560e56bc Mon Sep 17 00:00:00 2001
From: Dan Gohman <dev at sunfishcode.online>
Date: Mon, 9 Dec 2024 15:01:20 -0800
Subject: [PATCH 1/2] [SelectionDAG] Use the nuw flag when expanding loads.

When expanding a load into two loads, use nuw for the add that computes
the offset from the base of the second load, because the original load
doesn't straddle the address space.

It turns out there's already a dedicated helper function for doing this,
`getObjectPtrOffset`.

This is in target-independent code; however, in practice it only seems
to affect WebAssembly code, because WebAssembly load and store
instructions' constant offsets don't perform wrapping, so constant
folding often depends on the nuw flag being present.

This was noticed in the development of #119204.
---
 .../SelectionDAG/LegalizeTypesGeneric.cpp     |   2 +-
 llvm/test/CodeGen/WebAssembly/fpclamptosat.ll |  60 +-
 .../CodeGen/WebAssembly/fpclamptosat_vec.ll   | 180 +----
 llvm/test/CodeGen/WebAssembly/i128.ll         | 660 ++++++++----------
 .../test/CodeGen/WebAssembly/libcalls-trig.ll | 228 +++---
 llvm/test/CodeGen/WebAssembly/libcalls.ll     | 310 ++++----
 llvm/test/CodeGen/WebAssembly/multi-return.ll | 290 +++++---
 .../CodeGen/WebAssembly/multivalue_libcall.ll |  12 +-
 .../umulo-128-legalisation-lowering.ll        | 124 ++--
 9 files changed, 846 insertions(+), 1020 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 2655e8428309da..113a3bc0bbea69 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -265,7 +265,7 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
 
   // Increment the pointer to the other half.
   unsigned IncrementSize = NVT.getSizeInBits() / 8;
-  Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(IncrementSize), dl);
+  Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
   Hi = DAG.getLoad(
       NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(IncrementSize),
       LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), AAInfo);
diff --git a/llvm/test/CodeGen/WebAssembly/fpclamptosat.ll b/llvm/test/CodeGen/WebAssembly/fpclamptosat.ll
index 58e3f0dc2a93c0..137994ceac1322 100644
--- a/llvm/test/CodeGen/WebAssembly/fpclamptosat.ll
+++ b/llvm/test/CodeGen/WebAssembly/fpclamptosat.ll
@@ -524,9 +524,7 @@ define i64 @utest_f64i64(double %x) {
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    call __fixunsdfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -563,9 +561,7 @@ define i64 @utest_f64i64_cse_combine(double %x) #0 {
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    call __fixunsdfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -602,9 +598,7 @@ define i64 @ustest_f64i64(double %x) {
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    call __fixdfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -661,9 +655,7 @@ define i64 @ustest_f64i64_cse_combine(double %x) #0 {
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    call __fixdfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -727,9 +719,7 @@ define i64 @utest_f32i64(float %x) {
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    call __fixunssfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -781,9 +771,7 @@ define i64 @ustest_f32i64(float %x) {
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    call __fixsfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -840,9 +828,7 @@ define i64 @ustest_f32i64_cse_combine(float %x) #0 {
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    call __fixsfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -910,9 +896,7 @@ define i64 @utesth_f16i64(half %x) {
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    call __fixunssfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -981,9 +965,7 @@ define i64 @ustest_f16i64(half %x) {
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    call __fixsfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -1471,9 +1453,7 @@ define i64 @utest_f64i64_mm(double %x) {
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    call __fixunsdfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -1509,9 +1489,7 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    call __fixdfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -1573,9 +1551,7 @@ define i64 @utest_f32i64_mm(float %x) {
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    call __fixunssfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -1611,9 +1587,7 @@ define i64 @ustest_f32i64_mm(float %x) {
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    call __fixsfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -1679,9 +1653,7 @@ define i64 @utesth_f16i64_mm(half %x) {
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    call __fixunssfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -1719,9 +1691,7 @@ define i64 @ustest_f16i64_mm(half %x) {
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    call __fixsfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
diff --git a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
index 8f85575c1cf431..1feb5feb7a9ee8 100644
--- a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
@@ -685,19 +685,13 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
 ; CHECK-NEXT:    f64x2.extract_lane 0
 ; CHECK-NEXT:    call __fixdfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -800,19 +794,13 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) {
 ; CHECK-NEXT:    f64x2.extract_lane 0
 ; CHECK-NEXT:    call __fixunsdfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -863,19 +851,13 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) {
 ; CHECK-NEXT:    f64x2.extract_lane 0
 ; CHECK-NEXT:    call __fixdfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -964,19 +946,13 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
 ; CHECK-NEXT:    f32x4.extract_lane 0
 ; CHECK-NEXT:    call __fixsfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -1079,19 +1055,13 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
 ; CHECK-NEXT:    f32x4.extract_lane 0
 ; CHECK-NEXT:    call __fixunssfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -1142,19 +1112,13 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) {
 ; CHECK-NEXT:    f32x4.extract_lane 0
 ; CHECK-NEXT:    call __fixsfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -1245,19 +1209,13 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    call __fixsfti
 ; CHECK-NEXT:    local.get 2
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 2
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 2
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 5
 ; CHECK-NEXT:    local.get 2
 ; CHECK-NEXT:    i64.load 0
@@ -1362,19 +1320,13 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    call __fixunssfti
 ; CHECK-NEXT:    local.get 2
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 2
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 2
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 5
 ; CHECK-NEXT:    local.get 2
 ; CHECK-NEXT:    i64.load 0
@@ -1427,19 +1379,13 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) {
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    call __fixsfti
 ; CHECK-NEXT:    local.get 2
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 2
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 2
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 5
 ; CHECK-NEXT:    local.get 2
 ; CHECK-NEXT:    i64.load 0
@@ -2163,19 +2109,13 @@ define <2 x i64> @stest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NEXT:    f64x2.extract_lane 0
 ; CHECK-NEXT:    call __fixdfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -2276,19 +2216,13 @@ define <2 x i64> @utest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NEXT:    f64x2.extract_lane 0
 ; CHECK-NEXT:    call __fixunsdfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -2338,19 +2272,13 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NEXT:    f64x2.extract_lane 0
 ; CHECK-NEXT:    call __fixdfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -2421,19 +2349,13 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NEXT:    f32x4.extract_lane 0
 ; CHECK-NEXT:    call __fixsfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -2534,19 +2456,13 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NEXT:    f32x4.extract_lane 0
 ; CHECK-NEXT:    call __fixunssfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -2596,19 +2512,13 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NEXT:    f32x4.extract_lane 0
 ; CHECK-NEXT:    call __fixsfti
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 2
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 1
 ; CHECK-NEXT:    i64.load 0
@@ -2681,19 +2591,13 @@ define <2 x i64> @stest_f16i64_mm(<2 x half> %x) {
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    call __fixsfti
 ; CHECK-NEXT:    local.get 2
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 2
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 2
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 5
 ; CHECK-NEXT:    local.get 2
 ; CHECK-NEXT:    i64.load 0
@@ -2796,19 +2700,13 @@ define <2 x i64> @utesth_f16i64_mm(<2 x half> %x) {
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    call __fixunssfti
 ; CHECK-NEXT:    local.get 2
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 2
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 2
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 5
 ; CHECK-NEXT:    local.get 2
 ; CHECK-NEXT:    i64.load 0
@@ -2860,19 +2758,13 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
 ; CHECK-NEXT:    call __extendhfsf2
 ; CHECK-NEXT:    call __fixsfti
 ; CHECK-NEXT:    local.get 2
-; CHECK-NEXT:    i32.const 16
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 24
 ; CHECK-NEXT:    local.set 3
 ; CHECK-NEXT:    local.get 2
 ; CHECK-NEXT:    i64.load 16
 ; CHECK-NEXT:    local.set 4
 ; CHECK-NEXT:    local.get 2
-; CHECK-NEXT:    i32.const 8
-; CHECK-NEXT:    i32.add
-; CHECK-NEXT:    i64.load 0
+; CHECK-NEXT:    i64.load 8
 ; CHECK-NEXT:    local.set 5
 ; CHECK-NEXT:    local.get 2
 ; CHECK-NEXT:    i64.load 0
diff --git a/llvm/test/CodeGen/WebAssembly/i128.ll b/llvm/test/CodeGen/WebAssembly/i128.ll
index eae7f5f834dc0f..d9bec9b8ae887d 100644
--- a/llvm/test/CodeGen/WebAssembly/i128.ll
+++ b/llvm/test/CodeGen/WebAssembly/i128.ll
@@ -63,31 +63,29 @@ define i128 @mul128(i128 %x, i128 %y) {
 ; CHECK:         .functype mul128 (i32, i64, i64, i64, i64) -> ()
 ; CHECK-NEXT:    .local i32
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push4=, __stack_pointer
-; CHECK-NEXT:    i32.const $push5=, 16
-; CHECK-NEXT:    i32.sub $push9=, $pop4, $pop5
-; CHECK-NEXT:    local.tee $push8=, 5, $pop9
-; CHECK-NEXT:    global.set __stack_pointer, $pop8
-; CHECK-NEXT:    local.get $push14=, 5
-; CHECK-NEXT:    local.get $push13=, 1
-; CHECK-NEXT:    local.get $push12=, 2
-; CHECK-NEXT:    local.get $push11=, 3
-; CHECK-NEXT:    local.get $push10=, 4
-; CHECK-NEXT:    call __multi3, $pop14, $pop13, $pop12, $pop11, $pop10
+; CHECK-NEXT:    global.get $push2=, __stack_pointer
+; CHECK-NEXT:    i32.const $push3=, 16
+; CHECK-NEXT:    i32.sub $push7=, $pop2, $pop3
+; CHECK-NEXT:    local.tee $push6=, 5, $pop7
+; CHECK-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-NEXT:    local.get $push12=, 5
+; CHECK-NEXT:    local.get $push11=, 1
+; CHECK-NEXT:    local.get $push10=, 2
+; CHECK-NEXT:    local.get $push9=, 3
+; CHECK-NEXT:    local.get $push8=, 4
+; CHECK-NEXT:    call __multi3, $pop12, $pop11, $pop10, $pop9, $pop8
+; CHECK-NEXT:    local.get $push14=, 0
+; CHECK-NEXT:    local.get $push13=, 5
+; CHECK-NEXT:    i64.load $push0=, 8($pop13)
+; CHECK-NEXT:    i64.store 8($pop14), $pop0
 ; CHECK-NEXT:    local.get $push16=, 0
 ; CHECK-NEXT:    local.get $push15=, 5
-; CHECK-NEXT:    i32.const $push0=, 8
-; CHECK-NEXT:    i32.add $push1=, $pop15, $pop0
-; CHECK-NEXT:    i64.load $push2=, 0($pop1)
-; CHECK-NEXT:    i64.store 8($pop16), $pop2
-; CHECK-NEXT:    local.get $push18=, 0
+; CHECK-NEXT:    i64.load $push1=, 0($pop15)
+; CHECK-NEXT:    i64.store 0($pop16), $pop1
 ; CHECK-NEXT:    local.get $push17=, 5
-; CHECK-NEXT:    i64.load $push3=, 0($pop17)
-; CHECK-NEXT:    i64.store 0($pop18), $pop3
-; CHECK-NEXT:    local.get $push19=, 5
-; CHECK-NEXT:    i32.const $push6=, 16
-; CHECK-NEXT:    i32.add $push7=, $pop19, $pop6
-; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    i32.const $push4=, 16
+; CHECK-NEXT:    i32.add $push5=, $pop17, $pop4
+; CHECK-NEXT:    global.set __stack_pointer, $pop5
 ; CHECK-NEXT:    return
   %a = mul i128 %x, %y
   ret i128 %a
@@ -98,31 +96,29 @@ define i128 @sdiv128(i128 %x, i128 %y) {
 ; CHECK:         .functype sdiv128 (i32, i64, i64, i64, i64) -> ()
 ; CHECK-NEXT:    .local i32
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push4=, __stack_pointer
-; CHECK-NEXT:    i32.const $push5=, 16
-; CHECK-NEXT:    i32.sub $push9=, $pop4, $pop5
-; CHECK-NEXT:    local.tee $push8=, 5, $pop9
-; CHECK-NEXT:    global.set __stack_pointer, $pop8
-; CHECK-NEXT:    local.get $push14=, 5
-; CHECK-NEXT:    local.get $push13=, 1
-; CHECK-NEXT:    local.get $push12=, 2
-; CHECK-NEXT:    local.get $push11=, 3
-; CHECK-NEXT:    local.get $push10=, 4
-; CHECK-NEXT:    call __divti3, $pop14, $pop13, $pop12, $pop11, $pop10
+; CHECK-NEXT:    global.get $push2=, __stack_pointer
+; CHECK-NEXT:    i32.const $push3=, 16
+; CHECK-NEXT:    i32.sub  $push7=, $pop2, $pop3
+; CHECK-NEXT:    local.tee $push6=, 5, $pop7
+; CHECK-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-NEXT:    local.get $push12=, 5
+; CHECK-NEXT:    local.get $push11=, 1
+; CHECK-NEXT:    local.get $push10=, 2
+; CHECK-NEXT:    local.get $push9=, 3
+; CHECK-NEXT:    local.get $push8=, 4
+; CHECK-NEXT:    call __divti3, $pop12, $pop11, $pop10, $pop9, $pop8
+; CHECK-NEXT:    local.get $push14=, 0
+; CHECK-NEXT:    local.get $push13=, 5
+; CHECK-NEXT:    i64.load $push0=, 8($pop13)
+; CHECK-NEXT:    i64.store 8($pop14), $pop0
 ; CHECK-NEXT:    local.get $push16=, 0
 ; CHECK-NEXT:    local.get $push15=, 5
-; CHECK-NEXT:    i32.const $push0=, 8
-; CHECK-NEXT:    i32.add $push1=, $pop15, $pop0
-; CHECK-NEXT:    i64.load $push2=, 0($pop1)
-; CHECK-NEXT:    i64.store 8($pop16), $pop2
-; CHECK-NEXT:    local.get $push18=, 0
+; CHECK-NEXT:    i64.load $push1=, 0($pop15)
+; CHECK-NEXT:    i64.store 0($pop16), $pop1
 ; CHECK-NEXT:    local.get $push17=, 5
-; CHECK-NEXT:    i64.load $push3=, 0($pop17)
-; CHECK-NEXT:    i64.store 0($pop18), $pop3
-; CHECK-NEXT:    local.get $push19=, 5
-; CHECK-NEXT:    i32.const $push6=, 16
-; CHECK-NEXT:    i32.add $push7=, $pop19, $pop6
-; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    i32.const $push4=, 16
+; CHECK-NEXT:    i32.add  $push5=, $pop17, $pop4
+; CHECK-NEXT:    global.set __stack_pointer, $pop5
 ; CHECK-NEXT:    return
   %a = sdiv i128 %x, %y
   ret i128 %a
@@ -133,31 +129,29 @@ define i128 @udiv128(i128 %x, i128 %y) {
 ; CHECK:         .functype udiv128 (i32, i64, i64, i64, i64) -> ()
 ; CHECK-NEXT:    .local i32
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push4=, __stack_pointer
-; CHECK-NEXT:    i32.const $push5=, 16
-; CHECK-NEXT:    i32.sub $push9=, $pop4, $pop5
-; CHECK-NEXT:    local.tee $push8=, 5, $pop9
-; CHECK-NEXT:    global.set __stack_pointer, $pop8
-; CHECK-NEXT:    local.get $push14=, 5
-; CHECK-NEXT:    local.get $push13=, 1
-; CHECK-NEXT:    local.get $push12=, 2
-; CHECK-NEXT:    local.get $push11=, 3
-; CHECK-NEXT:    local.get $push10=, 4
-; CHECK-NEXT:    call __udivti3, $pop14, $pop13, $pop12, $pop11, $pop10
+; CHECK-NEXT:    global.get $push2=, __stack_pointer
+; CHECK-NEXT:    i32.const $push3=, 16
+; CHECK-NEXT:    i32.sub  $push7=, $pop2, $pop3
+; CHECK-NEXT:    local.tee $push6=, 5, $pop7
+; CHECK-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-NEXT:    local.get $push12=, 5
+; CHECK-NEXT:    local.get $push11=, 1
+; CHECK-NEXT:    local.get $push10=, 2
+; CHECK-NEXT:    local.get $push9=, 3
+; CHECK-NEXT:    local.get $push8=, 4
+; CHECK-NEXT:    call __udivti3, $pop12, $pop11, $pop10, $pop9, $pop8
+; CHECK-NEXT:    local.get $push14=, 0
+; CHECK-NEXT:    local.get $push13=, 5
+; CHECK-NEXT:    i64.load $push0=, 8($pop13)
+; CHECK-NEXT:    i64.store 8($pop14), $pop0
 ; CHECK-NEXT:    local.get $push16=, 0
 ; CHECK-NEXT:    local.get $push15=, 5
-; CHECK-NEXT:    i32.const $push0=, 8
-; CHECK-NEXT:    i32.add $push1=, $pop15, $pop0
-; CHECK-NEXT:    i64.load $push2=, 0($pop1)
-; CHECK-NEXT:    i64.store 8($pop16), $pop2
-; CHECK-NEXT:    local.get $push18=, 0
+; CHECK-NEXT:    i64.load $push1=, 0($pop15)
+; CHECK-NEXT:    i64.store 0($pop16), $pop1
 ; CHECK-NEXT:    local.get $push17=, 5
-; CHECK-NEXT:    i64.load $push3=, 0($pop17)
-; CHECK-NEXT:    i64.store 0($pop18), $pop3
-; CHECK-NEXT:    local.get $push19=, 5
-; CHECK-NEXT:    i32.const $push6=, 16
-; CHECK-NEXT:    i32.add $push7=, $pop19, $pop6
-; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    i32.const $push4=, 16
+; CHECK-NEXT:    i32.add  $push5=, $pop17, $pop4
+; CHECK-NEXT:    global.set __stack_pointer, $pop5
 ; CHECK-NEXT:    return
   %a = udiv i128 %x, %y
   ret i128 %a
@@ -168,31 +162,29 @@ define i128 @srem128(i128 %x, i128 %y) {
 ; CHECK:         .functype srem128 (i32, i64, i64, i64, i64) -> ()
 ; CHECK-NEXT:    .local i32
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push4=, __stack_pointer
-; CHECK-NEXT:    i32.const $push5=, 16
-; CHECK-NEXT:    i32.sub $push9=, $pop4, $pop5
-; CHECK-NEXT:    local.tee $push8=, 5, $pop9
-; CHECK-NEXT:    global.set __stack_pointer, $pop8
-; CHECK-NEXT:    local.get $push14=, 5
-; CHECK-NEXT:    local.get $push13=, 1
-; CHECK-NEXT:    local.get $push12=, 2
-; CHECK-NEXT:    local.get $push11=, 3
-; CHECK-NEXT:    local.get $push10=, 4
-; CHECK-NEXT:    call __modti3, $pop14, $pop13, $pop12, $pop11, $pop10
+; CHECK-NEXT:    global.get $push2=, __stack_pointer
+; CHECK-NEXT:    i32.const $push3=, 16
+; CHECK-NEXT:    i32.sub  $push7=, $pop2, $pop3
+; CHECK-NEXT:    local.tee $push6=, 5, $pop7
+; CHECK-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-NEXT:    local.get $push12=, 5
+; CHECK-NEXT:    local.get $push11=, 1
+; CHECK-NEXT:    local.get $push10=, 2
+; CHECK-NEXT:    local.get $push9=, 3
+; CHECK-NEXT:    local.get $push8=, 4
+; CHECK-NEXT:    call __modti3, $pop12, $pop11, $pop10, $pop9, $pop8
+; CHECK-NEXT:    local.get $push14=, 0
+; CHECK-NEXT:    local.get $push13=, 5
+; CHECK-NEXT:    i64.load $push0=, 8($pop13)
+; CHECK-NEXT:    i64.store 8($pop14), $pop0
 ; CHECK-NEXT:    local.get $push16=, 0
 ; CHECK-NEXT:    local.get $push15=, 5
-; CHECK-NEXT:    i32.const $push0=, 8
-; CHECK-NEXT:    i32.add $push1=, $pop15, $pop0
-; CHECK-NEXT:    i64.load $push2=, 0($pop1)
-; CHECK-NEXT:    i64.store 8($pop16), $pop2
-; CHECK-NEXT:    local.get $push18=, 0
+; CHECK-NEXT:    i64.load $push1=, 0($pop15)
+; CHECK-NEXT:    i64.store 0($pop16), $pop1
 ; CHECK-NEXT:    local.get $push17=, 5
-; CHECK-NEXT:    i64.load $push3=, 0($pop17)
-; CHECK-NEXT:    i64.store 0($pop18), $pop3
-; CHECK-NEXT:    local.get $push19=, 5
-; CHECK-NEXT:    i32.const $push6=, 16
-; CHECK-NEXT:    i32.add $push7=, $pop19, $pop6
-; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    i32.const $push4=, 16
+; CHECK-NEXT:    i32.add  $push5=, $pop17, $pop4
+; CHECK-NEXT:    global.set __stack_pointer, $pop5
 ; CHECK-NEXT:    return
   %a = srem i128 %x, %y
   ret i128 %a
@@ -203,31 +195,29 @@ define i128 @urem128(i128 %x, i128 %y) {
 ; CHECK:         .functype urem128 (i32, i64, i64, i64, i64) -> ()
 ; CHECK-NEXT:    .local i32
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push4=, __stack_pointer
-; CHECK-NEXT:    i32.const $push5=, 16
-; CHECK-NEXT:    i32.sub $push9=, $pop4, $pop5
-; CHECK-NEXT:    local.tee $push8=, 5, $pop9
-; CHECK-NEXT:    global.set __stack_pointer, $pop8
-; CHECK-NEXT:    local.get $push14=, 5
-; CHECK-NEXT:    local.get $push13=, 1
-; CHECK-NEXT:    local.get $push12=, 2
-; CHECK-NEXT:    local.get $push11=, 3
-; CHECK-NEXT:    local.get $push10=, 4
-; CHECK-NEXT:    call __umodti3, $pop14, $pop13, $pop12, $pop11, $pop10
+; CHECK-NEXT:    global.get $push2=, __stack_pointer
+; CHECK-NEXT:    i32.const $push3=, 16
+; CHECK-NEXT:    i32.sub  $push7=, $pop2, $pop3
+; CHECK-NEXT:    local.tee $push6=, 5, $pop7
+; CHECK-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-NEXT:    local.get $push12=, 5
+; CHECK-NEXT:    local.get $push11=, 1
+; CHECK-NEXT:    local.get $push10=, 2
+; CHECK-NEXT:    local.get $push9=, 3
+; CHECK-NEXT:    local.get $push8=, 4
+; CHECK-NEXT:    call __umodti3, $pop12, $pop11, $pop10, $pop9, $pop8
+; CHECK-NEXT:    local.get $push14=, 0
+; CHECK-NEXT:    local.get $push13=, 5
+; CHECK-NEXT:    i64.load $push0=, 8($pop13)
+; CHECK-NEXT:    i64.store 8($pop14), $pop0
 ; CHECK-NEXT:    local.get $push16=, 0
 ; CHECK-NEXT:    local.get $push15=, 5
-; CHECK-NEXT:    i32.const $push0=, 8
-; CHECK-NEXT:    i32.add $push1=, $pop15, $pop0
-; CHECK-NEXT:    i64.load $push2=, 0($pop1)
-; CHECK-NEXT:    i64.store 8($pop16), $pop2
-; CHECK-NEXT:    local.get $push18=, 0
+; CHECK-NEXT:    i64.load $push1=, 0($pop15)
+; CHECK-NEXT:    i64.store 0($pop16), $pop1
 ; CHECK-NEXT:    local.get $push17=, 5
-; CHECK-NEXT:    i64.load $push3=, 0($pop17)
-; CHECK-NEXT:    i64.store 0($pop18), $pop3
-; CHECK-NEXT:    local.get $push19=, 5
-; CHECK-NEXT:    i32.const $push6=, 16
-; CHECK-NEXT:    i32.add $push7=, $pop19, $pop6
-; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    i32.const $push4=, 16
+; CHECK-NEXT:    i32.add  $push5=, $pop17, $pop4
+; CHECK-NEXT:    global.set __stack_pointer, $pop5
 ; CHECK-NEXT:    return
   %a = urem i128 %x, %y
   ret i128 %a
@@ -295,31 +285,29 @@ define i128 @shl128(i128 %x, i128 %y) {
 ; CHECK:         .functype shl128 (i32, i64, i64, i64, i64) -> ()
 ; CHECK-NEXT:    .local i32
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push5=, __stack_pointer
-; CHECK-NEXT:    i32.const $push6=, 16
-; CHECK-NEXT:    i32.sub $push10=, $pop5, $pop6
-; CHECK-NEXT:    local.tee $push9=, 5, $pop10
-; CHECK-NEXT:    global.set __stack_pointer, $pop9
-; CHECK-NEXT:    local.get $push14=, 5
-; CHECK-NEXT:    local.get $push13=, 1
-; CHECK-NEXT:    local.get $push12=, 2
-; CHECK-NEXT:    local.get $push11=, 3
-; CHECK-NEXT:    i32.wrap_i64 $push0=, $pop11
-; CHECK-NEXT:    call __ashlti3, $pop14, $pop13, $pop12, $pop0
+; CHECK-NEXT:    global.get $push3=, __stack_pointer
+; CHECK-NEXT:    i32.const $push4=, 16
+; CHECK-NEXT:    i32.sub  $push8=, $pop3, $pop4
+; CHECK-NEXT:    local.tee $push7=, 5, $pop8
+; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    local.get $push12=, 5
+; CHECK-NEXT:    local.get $push11=, 1
+; CHECK-NEXT:    local.get $push10=, 2
+; CHECK-NEXT:    local.get $push9=, 3
+; CHECK-NEXT:    i32.wrap_i64 $push0=, $pop9
+; CHECK-NEXT:    call __ashlti3, $pop12, $pop11, $pop10, $pop0
+; CHECK-NEXT:    local.get $push14=, 0
+; CHECK-NEXT:    local.get $push13=, 5
+; CHECK-NEXT:    i64.load $push1=, 8($pop13)
+; CHECK-NEXT:    i64.store 8($pop14), $pop1
 ; CHECK-NEXT:    local.get $push16=, 0
 ; CHECK-NEXT:    local.get $push15=, 5
-; CHECK-NEXT:    i32.const $push1=, 8
-; CHECK-NEXT:    i32.add $push2=, $pop15, $pop1
-; CHECK-NEXT:    i64.load $push3=, 0($pop2)
-; CHECK-NEXT:    i64.store 8($pop16), $pop3
-; CHECK-NEXT:    local.get $push18=, 0
+; CHECK-NEXT:    i64.load $push2=, 0($pop15)
+; CHECK-NEXT:    i64.store 0($pop16), $pop2
 ; CHECK-NEXT:    local.get $push17=, 5
-; CHECK-NEXT:    i64.load $push4=, 0($pop17)
-; CHECK-NEXT:    i64.store 0($pop18), $pop4
-; CHECK-NEXT:    local.get $push19=, 5
-; CHECK-NEXT:    i32.const $push7=, 16
-; CHECK-NEXT:    i32.add $push8=, $pop19, $pop7
-; CHECK-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-NEXT:    i32.const $push5=, 16
+; CHECK-NEXT:    i32.add  $push6=, $pop17, $pop5
+; CHECK-NEXT:    global.set __stack_pointer, $pop6
 ; CHECK-NEXT:    return
   %a = shl i128 %x, %y
   ret i128 %a
@@ -330,31 +318,29 @@ define i128 @shr128(i128 %x, i128 %y) {
 ; CHECK:         .functype shr128 (i32, i64, i64, i64, i64) -> ()
 ; CHECK-NEXT:    .local i32
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push5=, __stack_pointer
-; CHECK-NEXT:    i32.const $push6=, 16
-; CHECK-NEXT:    i32.sub $push10=, $pop5, $pop6
-; CHECK-NEXT:    local.tee $push9=, 5, $pop10
-; CHECK-NEXT:    global.set __stack_pointer, $pop9
-; CHECK-NEXT:    local.get $push14=, 5
-; CHECK-NEXT:    local.get $push13=, 1
-; CHECK-NEXT:    local.get $push12=, 2
-; CHECK-NEXT:    local.get $push11=, 3
-; CHECK-NEXT:    i32.wrap_i64 $push0=, $pop11
-; CHECK-NEXT:    call __lshrti3, $pop14, $pop13, $pop12, $pop0
+; CHECK-NEXT:    global.get $push3=, __stack_pointer
+; CHECK-NEXT:    i32.const $push4=, 16
+; CHECK-NEXT:    i32.sub  $push8=, $pop3, $pop4
+; CHECK-NEXT:    local.tee $push7=, 5, $pop8
+; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    local.get $push12=, 5
+; CHECK-NEXT:    local.get $push11=, 1
+; CHECK-NEXT:    local.get $push10=, 2
+; CHECK-NEXT:    local.get $push9=, 3
+; CHECK-NEXT:    i32.wrap_i64 $push0=, $pop9
+; CHECK-NEXT:    call __lshrti3, $pop12, $pop11, $pop10, $pop0
+; CHECK-NEXT:    local.get $push14=, 0
+; CHECK-NEXT:    local.get $push13=, 5
+; CHECK-NEXT:    i64.load $push1=, 8($pop13)
+; CHECK-NEXT:    i64.store 8($pop14), $pop1
 ; CHECK-NEXT:    local.get $push16=, 0
 ; CHECK-NEXT:    local.get $push15=, 5
-; CHECK-NEXT:    i32.const $push1=, 8
-; CHECK-NEXT:    i32.add $push2=, $pop15, $pop1
-; CHECK-NEXT:    i64.load $push3=, 0($pop2)
-; CHECK-NEXT:    i64.store 8($pop16), $pop3
-; CHECK-NEXT:    local.get $push18=, 0
+; CHECK-NEXT:    i64.load $push2=, 0($pop15)
+; CHECK-NEXT:    i64.store 0($pop16), $pop2
 ; CHECK-NEXT:    local.get $push17=, 5
-; CHECK-NEXT:    i64.load $push4=, 0($pop17)
-; CHECK-NEXT:    i64.store 0($pop18), $pop4
-; CHECK-NEXT:    local.get $push19=, 5
-; CHECK-NEXT:    i32.const $push7=, 16
-; CHECK-NEXT:    i32.add $push8=, $pop19, $pop7
-; CHECK-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-NEXT:    i32.const $push5=, 16
+; CHECK-NEXT:    i32.add  $push6=, $pop17, $pop5
+; CHECK-NEXT:    global.set __stack_pointer, $pop6
 ; CHECK-NEXT:    return
   %a = lshr i128 %x, %y
   ret i128 %a
@@ -365,31 +351,29 @@ define i128 @sar128(i128 %x, i128 %y) {
 ; CHECK:         .functype sar128 (i32, i64, i64, i64, i64) -> ()
 ; CHECK-NEXT:    .local i32
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push5=, __stack_pointer
-; CHECK-NEXT:    i32.const $push6=, 16
-; CHECK-NEXT:    i32.sub $push10=, $pop5, $pop6
-; CHECK-NEXT:    local.tee $push9=, 5, $pop10
-; CHECK-NEXT:    global.set __stack_pointer, $pop9
-; CHECK-NEXT:    local.get $push14=, 5
-; CHECK-NEXT:    local.get $push13=, 1
-; CHECK-NEXT:    local.get $push12=, 2
-; CHECK-NEXT:    local.get $push11=, 3
-; CHECK-NEXT:    i32.wrap_i64 $push0=, $pop11
-; CHECK-NEXT:    call __ashrti3, $pop14, $pop13, $pop12, $pop0
+; CHECK-NEXT:    global.get $push3=, __stack_pointer
+; CHECK-NEXT:    i32.const $push4=, 16
+; CHECK-NEXT:    i32.sub  $push8=, $pop3, $pop4
+; CHECK-NEXT:    local.tee $push7=, 5, $pop8
+; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    local.get $push12=, 5
+; CHECK-NEXT:    local.get $push11=, 1
+; CHECK-NEXT:    local.get $push10=, 2
+; CHECK-NEXT:    local.get $push9=, 3
+; CHECK-NEXT:    i32.wrap_i64 $push0=, $pop9
+; CHECK-NEXT:    call __ashrti3, $pop12, $pop11, $pop10, $pop0
+; CHECK-NEXT:    local.get $push14=, 0
+; CHECK-NEXT:    local.get $push13=, 5
+; CHECK-NEXT:    i64.load $push1=, 8($pop13)
+; CHECK-NEXT:    i64.store 8($pop14), $pop1
 ; CHECK-NEXT:    local.get $push16=, 0
 ; CHECK-NEXT:    local.get $push15=, 5
-; CHECK-NEXT:    i32.const $push1=, 8
-; CHECK-NEXT:    i32.add $push2=, $pop15, $pop1
-; CHECK-NEXT:    i64.load $push3=, 0($pop2)
-; CHECK-NEXT:    i64.store 8($pop16), $pop3
-; CHECK-NEXT:    local.get $push18=, 0
+; CHECK-NEXT:    i64.load $push2=, 0($pop15)
+; CHECK-NEXT:    i64.store 0($pop16), $pop2
 ; CHECK-NEXT:    local.get $push17=, 5
-; CHECK-NEXT:    i64.load $push4=, 0($pop17)
-; CHECK-NEXT:    i64.store 0($pop18), $pop4
-; CHECK-NEXT:    local.get $push19=, 5
-; CHECK-NEXT:    i32.const $push7=, 16
-; CHECK-NEXT:    i32.add $push8=, $pop19, $pop7
-; CHECK-NEXT:    global.set __stack_pointer, $pop8
+; CHECK-NEXT:    i32.const $push5=, 16
+; CHECK-NEXT:    i32.add  $push6=, $pop17, $pop5
+; CHECK-NEXT:    global.set __stack_pointer, $pop6
 ; CHECK-NEXT:    return
   %a = ashr i128 %x, %y
   ret i128 %a
@@ -529,51 +513,45 @@ define i128 @rotl(i128 %x, i128 %y) {
 ; CHECK:         .functype rotl (i32, i64, i64, i64, i64) -> ()
 ; CHECK-NEXT:    .local i32, i32
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push11=, __stack_pointer
-; CHECK-NEXT:    i32.const $push12=, 32
-; CHECK-NEXT:    i32.sub $push23=, $pop11, $pop12
-; CHECK-NEXT:    local.tee $push22=, 5, $pop23
-; CHECK-NEXT:    global.set __stack_pointer, $pop22
-; CHECK-NEXT:    local.get $push24=, 5
-; CHECK-NEXT:    i32.const $push17=, 16
-; CHECK-NEXT:    i32.add $push18=, $pop24, $pop17
-; CHECK-NEXT:    local.get $push27=, 1
-; CHECK-NEXT:    local.get $push26=, 2
-; CHECK-NEXT:    local.get $push25=, 3
-; CHECK-NEXT:    i32.wrap_i64 $push21=, $pop25
-; CHECK-NEXT:    local.tee $push20=, 6, $pop21
-; CHECK-NEXT:    call __ashlti3, $pop18, $pop27, $pop26, $pop20
-; CHECK-NEXT:    local.get $push31=, 5
-; CHECK-NEXT:    local.get $push30=, 1
-; CHECK-NEXT:    local.get $push29=, 2
+; CHECK-NEXT:    global.get $push8=, __stack_pointer
+; CHECK-NEXT:    i32.const $push9=, 32
+; CHECK-NEXT:    i32.sub  $push17=, $pop8, $pop9
+; CHECK-NEXT:    local.tee $push16=, 5, $pop17
+; CHECK-NEXT:    global.set __stack_pointer, $pop16
+; CHECK-NEXT:    local.get $push18=, 5
+; CHECK-NEXT:    i32.const $push12=, 16
+; CHECK-NEXT:    i32.add  $push13=, $pop18, $pop12
+; CHECK-NEXT:    local.get $push21=, 1
+; CHECK-NEXT:    local.get $push20=, 2
+; CHECK-NEXT:    local.get $push19=, 3
+; CHECK-NEXT:    i32.wrap_i64 $push15=, $pop19
+; CHECK-NEXT:    local.tee $push14=, 6, $pop15
+; CHECK-NEXT:    call __ashlti3, $pop13, $pop21, $pop20, $pop14
+; CHECK-NEXT:    local.get $push25=, 5
+; CHECK-NEXT:    local.get $push24=, 1
+; CHECK-NEXT:    local.get $push23=, 2
 ; CHECK-NEXT:    i32.const $push0=, 128
-; CHECK-NEXT:    local.get $push28=, 6
-; CHECK-NEXT:    i32.sub $push1=, $pop0, $pop28
-; CHECK-NEXT:    call __lshrti3, $pop31, $pop30, $pop29, $pop1
-; CHECK-NEXT:    local.get $push34=, 0
+; CHECK-NEXT:    local.get $push22=, 6
+; CHECK-NEXT:    i32.sub  $push1=, $pop0, $pop22
+; CHECK-NEXT:    call __lshrti3, $pop25, $pop24, $pop23, $pop1
+; CHECK-NEXT:    local.get $push28=, 0
+; CHECK-NEXT:    local.get $push26=, 5
+; CHECK-NEXT:    i64.load $push2=, 24($pop26)
+; CHECK-NEXT:    local.get $push27=, 5
+; CHECK-NEXT:    i64.load $push3=, 8($pop27)
+; CHECK-NEXT:    i64.or   $push4=, $pop2, $pop3
+; CHECK-NEXT:    i64.store 8($pop28), $pop4
+; CHECK-NEXT:    local.get $push31=, 0
+; CHECK-NEXT:    local.get $push29=, 5
+; CHECK-NEXT:    i64.load $push5=, 16($pop29)
+; CHECK-NEXT:    local.get $push30=, 5
+; CHECK-NEXT:    i64.load $push6=, 0($pop30)
+; CHECK-NEXT:    i64.or   $push7=, $pop5, $pop6
+; CHECK-NEXT:    i64.store 0($pop31), $pop7
 ; CHECK-NEXT:    local.get $push32=, 5
-; CHECK-NEXT:    i32.const $push15=, 16
-; CHECK-NEXT:    i32.add $push16=, $pop32, $pop15
-; CHECK-NEXT:    i32.const $push2=, 8
-; CHECK-NEXT:    i32.add $push3=, $pop16, $pop2
-; CHECK-NEXT:    i64.load $push4=, 0($pop3)
-; CHECK-NEXT:    local.get $push33=, 5
-; CHECK-NEXT:    i32.const $push19=, 8
-; CHECK-NEXT:    i32.add $push5=, $pop33, $pop19
-; CHECK-NEXT:    i64.load $push6=, 0($pop5)
-; CHECK-NEXT:    i64.or $push7=, $pop4, $pop6
-; CHECK-NEXT:    i64.store 8($pop34), $pop7
-; CHECK-NEXT:    local.get $push37=, 0
-; CHECK-NEXT:    local.get $push35=, 5
-; CHECK-NEXT:    i64.load $push8=, 16($pop35)
-; CHECK-NEXT:    local.get $push36=, 5
-; CHECK-NEXT:    i64.load $push9=, 0($pop36)
-; CHECK-NEXT:    i64.or $push10=, $pop8, $pop9
-; CHECK-NEXT:    i64.store 0($pop37), $pop10
-; CHECK-NEXT:    local.get $push38=, 5
-; CHECK-NEXT:    i32.const $push13=, 32
-; CHECK-NEXT:    i32.add $push14=, $pop38, $pop13
-; CHECK-NEXT:    global.set __stack_pointer, $pop14
+; CHECK-NEXT:    i32.const $push10=, 32
+; CHECK-NEXT:    i32.add  $push11=, $pop32, $pop10
+; CHECK-NEXT:    global.set __stack_pointer, $pop11
 ; CHECK-NEXT:    return
   %z = sub i128 128, %y
   %b = shl i128 %x, %y
@@ -587,53 +565,47 @@ define i128 @masked_rotl(i128 %x, i128 %y) {
 ; CHECK:         .functype masked_rotl (i32, i64, i64, i64, i64) -> ()
 ; CHECK-NEXT:    .local i32, i32
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push13=, __stack_pointer
-; CHECK-NEXT:    i32.const $push14=, 32
-; CHECK-NEXT:    i32.sub $push25=, $pop13, $pop14
-; CHECK-NEXT:    local.tee $push24=, 5, $pop25
-; CHECK-NEXT:    global.set __stack_pointer, $pop24
-; CHECK-NEXT:    local.get $push26=, 5
-; CHECK-NEXT:    i32.const $push19=, 16
-; CHECK-NEXT:    i32.add $push20=, $pop26, $pop19
-; CHECK-NEXT:    local.get $push29=, 1
-; CHECK-NEXT:    local.get $push28=, 2
-; CHECK-NEXT:    local.get $push27=, 3
-; CHECK-NEXT:    i32.wrap_i64 $push0=, $pop27
+; CHECK-NEXT:    global.get $push10=, __stack_pointer
+; CHECK-NEXT:    i32.const $push11=, 32
+; CHECK-NEXT:    i32.sub  $push19=, $pop10, $pop11
+; CHECK-NEXT:    local.tee $push18=, 5, $pop19
+; CHECK-NEXT:    global.set __stack_pointer, $pop18
+; CHECK-NEXT:    local.get $push20=, 5
+; CHECK-NEXT:    i32.const $push14=, 16
+; CHECK-NEXT:    i32.add  $push15=, $pop20, $pop14
+; CHECK-NEXT:    local.get $push23=, 1
+; CHECK-NEXT:    local.get $push22=, 2
+; CHECK-NEXT:    local.get $push21=, 3
+; CHECK-NEXT:    i32.wrap_i64 $push0=, $pop21
 ; CHECK-NEXT:    i32.const $push1=, 127
-; CHECK-NEXT:    i32.and $push23=, $pop0, $pop1
-; CHECK-NEXT:    local.tee $push22=, 6, $pop23
-; CHECK-NEXT:    call __ashlti3, $pop20, $pop29, $pop28, $pop22
-; CHECK-NEXT:    local.get $push33=, 5
-; CHECK-NEXT:    local.get $push32=, 1
-; CHECK-NEXT:    local.get $push31=, 2
+; CHECK-NEXT:    i32.and  $push17=, $pop0, $pop1
+; CHECK-NEXT:    local.tee $push16=, 6, $pop17
+; CHECK-NEXT:    call __ashlti3, $pop15, $pop23, $pop22, $pop16
+; CHECK-NEXT:    local.get $push27=, 5
+; CHECK-NEXT:    local.get $push26=, 1
+; CHECK-NEXT:    local.get $push25=, 2
 ; CHECK-NEXT:    i32.const $push2=, 128
-; CHECK-NEXT:    local.get $push30=, 6
-; CHECK-NEXT:    i32.sub $push3=, $pop2, $pop30
-; CHECK-NEXT:    call __lshrti3, $pop33, $pop32, $pop31, $pop3
-; CHECK-NEXT:    local.get $push36=, 0
+; CHECK-NEXT:    local.get $push24=, 6
+; CHECK-NEXT:    i32.sub  $push3=, $pop2, $pop24
+; CHECK-NEXT:    call __lshrti3, $pop27, $pop26, $pop25, $pop3
+; CHECK-NEXT:    local.get $push30=, 0
+; CHECK-NEXT:    local.get $push28=, 5
+; CHECK-NEXT:    i64.load $push4=, 24($pop28)
+; CHECK-NEXT:    local.get $push29=, 5
+; CHECK-NEXT:    i64.load $push5=, 8($pop29)
+; CHECK-NEXT:    i64.or   $push6=, $pop4, $pop5
+; CHECK-NEXT:    i64.store 8($pop30), $pop6
+; CHECK-NEXT:    local.get $push33=, 0
+; CHECK-NEXT:    local.get $push31=, 5
+; CHECK-NEXT:    i64.load $push7=, 16($pop31)
+; CHECK-NEXT:    local.get $push32=, 5
+; CHECK-NEXT:    i64.load $push8=, 0($pop32)
+; CHECK-NEXT:    i64.or   $push9=, $pop7, $pop8
+; CHECK-NEXT:    i64.store 0($pop33), $pop9
 ; CHECK-NEXT:    local.get $push34=, 5
-; CHECK-NEXT:    i32.const $push17=, 16
-; CHECK-NEXT:    i32.add $push18=, $pop34, $pop17
-; CHECK-NEXT:    i32.const $push4=, 8
-; CHECK-NEXT:    i32.add $push5=, $pop18, $pop4
-; CHECK-NEXT:    i64.load $push6=, 0($pop5)
-; CHECK-NEXT:    local.get $push35=, 5
-; CHECK-NEXT:    i32.const $push21=, 8
-; CHECK-NEXT:    i32.add $push7=, $pop35, $pop21
-; CHECK-NEXT:    i64.load $push8=, 0($pop7)
-; CHECK-NEXT:    i64.or $push9=, $pop6, $pop8
-; CHECK-NEXT:    i64.store 8($pop36), $pop9
-; CHECK-NEXT:    local.get $push39=, 0
-; CHECK-NEXT:    local.get $push37=, 5
-; CHECK-NEXT:    i64.load $push10=, 16($pop37)
-; CHECK-NEXT:    local.get $push38=, 5
-; CHECK-NEXT:    i64.load $push11=, 0($pop38)
-; CHECK-NEXT:    i64.or $push12=, $pop10, $pop11
-; CHECK-NEXT:    i64.store 0($pop39), $pop12
-; CHECK-NEXT:    local.get $push40=, 5
-; CHECK-NEXT:    i32.const $push15=, 32
-; CHECK-NEXT:    i32.add $push16=, $pop40, $pop15
-; CHECK-NEXT:    global.set __stack_pointer, $pop16
+; CHECK-NEXT:    i32.const $push12=, 32
+; CHECK-NEXT:    i32.add  $push13=, $pop34, $pop12
+; CHECK-NEXT:    global.set __stack_pointer, $pop13
 ; CHECK-NEXT:    return
   %a = and i128 %y, 127
   %z = sub i128 128, %a
@@ -648,51 +620,45 @@ define i128 @rotr(i128 %x, i128 %y) {
 ; CHECK:         .functype rotr (i32, i64, i64, i64, i64) -> ()
 ; CHECK-NEXT:    .local i32, i32
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push11=, __stack_pointer
-; CHECK-NEXT:    i32.const $push12=, 32
-; CHECK-NEXT:    i32.sub $push23=, $pop11, $pop12
-; CHECK-NEXT:    local.tee $push22=, 5, $pop23
-; CHECK-NEXT:    global.set __stack_pointer, $pop22
-; CHECK-NEXT:    local.get $push24=, 5
-; CHECK-NEXT:    i32.const $push17=, 16
-; CHECK-NEXT:    i32.add $push18=, $pop24, $pop17
-; CHECK-NEXT:    local.get $push27=, 1
-; CHECK-NEXT:    local.get $push26=, 2
-; CHECK-NEXT:    local.get $push25=, 3
-; CHECK-NEXT:    i32.wrap_i64 $push21=, $pop25
-; CHECK-NEXT:    local.tee $push20=, 6, $pop21
-; CHECK-NEXT:    call __lshrti3, $pop18, $pop27, $pop26, $pop20
-; CHECK-NEXT:    local.get $push31=, 5
-; CHECK-NEXT:    local.get $push30=, 1
-; CHECK-NEXT:    local.get $push29=, 2
+; CHECK-NEXT:    global.get $push8=, __stack_pointer
+; CHECK-NEXT:    i32.const $push9=, 32
+; CHECK-NEXT:    i32.sub  $push17=, $pop8, $pop9
+; CHECK-NEXT:    local.tee $push16=, 5, $pop17
+; CHECK-NEXT:    global.set __stack_pointer, $pop16
+; CHECK-NEXT:    local.get $push18=, 5
+; CHECK-NEXT:    i32.const $push12=, 16
+; CHECK-NEXT:    i32.add  $push13=, $pop18, $pop12
+; CHECK-NEXT:    local.get $push21=, 1
+; CHECK-NEXT:    local.get $push20=, 2
+; CHECK-NEXT:    local.get $push19=, 3
+; CHECK-NEXT:    i32.wrap_i64 $push15=, $pop19
+; CHECK-NEXT:    local.tee $push14=, 6, $pop15
+; CHECK-NEXT:    call __lshrti3, $pop13, $pop21, $pop20, $pop14
+; CHECK-NEXT:    local.get $push25=, 5
+; CHECK-NEXT:    local.get $push24=, 1
+; CHECK-NEXT:    local.get $push23=, 2
 ; CHECK-NEXT:    i32.const $push0=, 128
-; CHECK-NEXT:    local.get $push28=, 6
-; CHECK-NEXT:    i32.sub $push1=, $pop0, $pop28
-; CHECK-NEXT:    call __ashlti3, $pop31, $pop30, $pop29, $pop1
-; CHECK-NEXT:    local.get $push34=, 0
+; CHECK-NEXT:    local.get $push22=, 6
+; CHECK-NEXT:    i32.sub  $push1=, $pop0, $pop22
+; CHECK-NEXT:    call __ashlti3, $pop25, $pop24, $pop23, $pop1
+; CHECK-NEXT:    local.get $push28=, 0
+; CHECK-NEXT:    local.get $push26=, 5
+; CHECK-NEXT:    i64.load $push2=, 24($pop26)
+; CHECK-NEXT:    local.get $push27=, 5
+; CHECK-NEXT:    i64.load $push3=, 8($pop27)
+; CHECK-NEXT:    i64.or   $push4=, $pop2, $pop3
+; CHECK-NEXT:    i64.store 8($pop28), $pop4
+; CHECK-NEXT:    local.get $push31=, 0
+; CHECK-NEXT:    local.get $push29=, 5
+; CHECK-NEXT:    i64.load $push5=, 16($pop29)
+; CHECK-NEXT:    local.get $push30=, 5
+; CHECK-NEXT:    i64.load $push6=, 0($pop30)
+; CHECK-NEXT:    i64.or   $push7=, $pop5, $pop6
+; CHECK-NEXT:    i64.store 0($pop31), $pop7
 ; CHECK-NEXT:    local.get $push32=, 5
-; CHECK-NEXT:    i32.const $push15=, 16
-; CHECK-NEXT:    i32.add $push16=, $pop32, $pop15
-; CHECK-NEXT:    i32.const $push2=, 8
-; CHECK-NEXT:    i32.add $push3=, $pop16, $pop2
-; CHECK-NEXT:    i64.load $push4=, 0($pop3)
-; CHECK-NEXT:    local.get $push33=, 5
-; CHECK-NEXT:    i32.const $push19=, 8
-; CHECK-NEXT:    i32.add $push5=, $pop33, $pop19
-; CHECK-NEXT:    i64.load $push6=, 0($pop5)
-; CHECK-NEXT:    i64.or $push7=, $pop4, $pop6
-; CHECK-NEXT:    i64.store 8($pop34), $pop7
-; CHECK-NEXT:    local.get $push37=, 0
-; CHECK-NEXT:    local.get $push35=, 5
-; CHECK-NEXT:    i64.load $push8=, 16($pop35)
-; CHECK-NEXT:    local.get $push36=, 5
-; CHECK-NEXT:    i64.load $push9=, 0($pop36)
-; CHECK-NEXT:    i64.or $push10=, $pop8, $pop9
-; CHECK-NEXT:    i64.store 0($pop37), $pop10
-; CHECK-NEXT:    local.get $push38=, 5
-; CHECK-NEXT:    i32.const $push13=, 32
-; CHECK-NEXT:    i32.add $push14=, $pop38, $pop13
-; CHECK-NEXT:    global.set __stack_pointer, $pop14
+; CHECK-NEXT:    i32.const $push10=, 32
+; CHECK-NEXT:    i32.add  $push11=, $pop32, $pop10
+; CHECK-NEXT:    global.set __stack_pointer, $pop11
 ; CHECK-NEXT:    return
   %z = sub i128 128, %y
   %b = lshr i128 %x, %y
@@ -706,53 +672,47 @@ define i128 @masked_rotr(i128 %x, i128 %y) {
 ; CHECK:         .functype masked_rotr (i32, i64, i64, i64, i64) -> ()
 ; CHECK-NEXT:    .local i32, i32
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push13=, __stack_pointer
-; CHECK-NEXT:    i32.const $push14=, 32
-; CHECK-NEXT:    i32.sub $push25=, $pop13, $pop14
-; CHECK-NEXT:    local.tee $push24=, 5, $pop25
-; CHECK-NEXT:    global.set __stack_pointer, $pop24
-; CHECK-NEXT:    local.get $push26=, 5
-; CHECK-NEXT:    i32.const $push19=, 16
-; CHECK-NEXT:    i32.add $push20=, $pop26, $pop19
-; CHECK-NEXT:    local.get $push29=, 1
-; CHECK-NEXT:    local.get $push28=, 2
-; CHECK-NEXT:    local.get $push27=, 3
-; CHECK-NEXT:    i32.wrap_i64 $push0=, $pop27
+; CHECK-NEXT:    global.get $push10=, __stack_pointer
+; CHECK-NEXT:    i32.const $push11=, 32
+; CHECK-NEXT:    i32.sub  $push19=, $pop10, $pop11
+; CHECK-NEXT:    local.tee $push18=, 5, $pop19
+; CHECK-NEXT:    global.set __stack_pointer, $pop18
+; CHECK-NEXT:    local.get $push20=, 5
+; CHECK-NEXT:    i32.const $push14=, 16
+; CHECK-NEXT:    i32.add  $push15=, $pop20, $pop14
+; CHECK-NEXT:    local.get $push23=, 1
+; CHECK-NEXT:    local.get $push22=, 2
+; CHECK-NEXT:    local.get $push21=, 3
+; CHECK-NEXT:    i32.wrap_i64 $push0=, $pop21
 ; CHECK-NEXT:    i32.const $push1=, 127
-; CHECK-NEXT:    i32.and $push23=, $pop0, $pop1
-; CHECK-NEXT:    local.tee $push22=, 6, $pop23
-; CHECK-NEXT:    call __lshrti3, $pop20, $pop29, $pop28, $pop22
-; CHECK-NEXT:    local.get $push33=, 5
-; CHECK-NEXT:    local.get $push32=, 1
-; CHECK-NEXT:    local.get $push31=, 2
+; CHECK-NEXT:    i32.and  $push17=, $pop0, $pop1
+; CHECK-NEXT:    local.tee $push16=, 6, $pop17
+; CHECK-NEXT:    call __lshrti3, $pop15, $pop23, $pop22, $pop16
+; CHECK-NEXT:    local.get $push27=, 5
+; CHECK-NEXT:    local.get $push26=, 1
+; CHECK-NEXT:    local.get $push25=, 2
 ; CHECK-NEXT:    i32.const $push2=, 128
-; CHECK-NEXT:    local.get $push30=, 6
-; CHECK-NEXT:    i32.sub $push3=, $pop2, $pop30
-; CHECK-NEXT:    call __ashlti3, $pop33, $pop32, $pop31, $pop3
-; CHECK-NEXT:    local.get $push36=, 0
+; CHECK-NEXT:    local.get $push24=, 6
+; CHECK-NEXT:    i32.sub  $push3=, $pop2, $pop24
+; CHECK-NEXT:    call __ashlti3, $pop27, $pop26, $pop25, $pop3
+; CHECK-NEXT:    local.get $push30=, 0
+; CHECK-NEXT:    local.get $push28=, 5
+; CHECK-NEXT:    i64.load $push4=, 24($pop28)
+; CHECK-NEXT:    local.get $push29=, 5
+; CHECK-NEXT:    i64.load $push5=, 8($pop29)
+; CHECK-NEXT:    i64.or   $push6=, $pop4, $pop5
+; CHECK-NEXT:    i64.store 8($pop30), $pop6
+; CHECK-NEXT:    local.get $push33=, 0
+; CHECK-NEXT:    local.get $push31=, 5
+; CHECK-NEXT:    i64.load $push7=, 16($pop31)
+; CHECK-NEXT:    local.get $push32=, 5
+; CHECK-NEXT:    i64.load $push8=, 0($pop32)
+; CHECK-NEXT:    i64.or   $push9=, $pop7, $pop8
+; CHECK-NEXT:    i64.store 0($pop33), $pop9
 ; CHECK-NEXT:    local.get $push34=, 5
-; CHECK-NEXT:    i32.const $push17=, 16
-; CHECK-NEXT:    i32.add $push18=, $pop34, $pop17
-; CHECK-NEXT:    i32.const $push4=, 8
-; CHECK-NEXT:    i32.add $push5=, $pop18, $pop4
-; CHECK-NEXT:    i64.load $push6=, 0($pop5)
-; CHECK-NEXT:    local.get $push35=, 5
-; CHECK-NEXT:    i32.const $push21=, 8
-; CHECK-NEXT:    i32.add $push7=, $pop35, $pop21
-; CHECK-NEXT:    i64.load $push8=, 0($pop7)
-; CHECK-NEXT:    i64.or $push9=, $pop6, $pop8
-; CHECK-NEXT:    i64.store 8($pop36), $pop9
-; CHECK-NEXT:    local.get $push39=, 0
-; CHECK-NEXT:    local.get $push37=, 5
-; CHECK-NEXT:    i64.load $push10=, 16($pop37)
-; CHECK-NEXT:    local.get $push38=, 5
-; CHECK-NEXT:    i64.load $push11=, 0($pop38)
-; CHECK-NEXT:    i64.or $push12=, $pop10, $pop11
-; CHECK-NEXT:    i64.store 0($pop39), $pop12
-; CHECK-NEXT:    local.get $push40=, 5
-; CHECK-NEXT:    i32.const $push15=, 32
-; CHECK-NEXT:    i32.add $push16=, $pop40, $pop15
-; CHECK-NEXT:    global.set __stack_pointer, $pop16
+; CHECK-NEXT:    i32.const $push12=, 32
+; CHECK-NEXT:    i32.add  $push13=, $pop34, $pop12
+; CHECK-NEXT:    global.set __stack_pointer, $pop13
 ; CHECK-NEXT:    return
   %a = and i128 %y, 127
   %z = sub i128 128, %a
diff --git a/llvm/test/CodeGen/WebAssembly/libcalls-trig.ll b/llvm/test/CodeGen/WebAssembly/libcalls-trig.ll
index 7850559b49b7d7..042009b9365e7d 100644
--- a/llvm/test/CodeGen/WebAssembly/libcalls-trig.ll
+++ b/llvm/test/CodeGen/WebAssembly/libcalls-trig.ll
@@ -45,139 +45,101 @@ define fp128 @fp128libcalls(fp128 %x) {
 ; CHECK:         .functype fp128libcalls (i32, i64, i64) -> ()
 ; CHECK-NEXT:    .local i32
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push31=, __stack_pointer
-; CHECK-NEXT:    i32.const $push32=, 160
-; CHECK-NEXT:    i32.sub $push81=, $pop31, $pop32
-; CHECK-NEXT:    local.tee $push80=, 3, $pop81
-; CHECK-NEXT:    global.set __stack_pointer, $pop80
-; CHECK-NEXT:    local.get $push82=, 3
-; CHECK-NEXT:    i32.const $push69=, 144
-; CHECK-NEXT:    i32.add $push70=, $pop82, $pop69
-; CHECK-NEXT:    local.get $push84=, 1
-; CHECK-NEXT:    local.get $push83=, 2
-; CHECK-NEXT:    call sinl, $pop70, $pop84, $pop83
-; CHECK-NEXT:    local.get $push85=, 3
-; CHECK-NEXT:    i32.const $push65=, 128
-; CHECK-NEXT:    i32.add $push66=, $pop85, $pop65
-; CHECK-NEXT:    local.get $push86=, 3
-; CHECK-NEXT:    i64.load $push3=, 144($pop86)
-; CHECK-NEXT:    local.get $push87=, 3
-; CHECK-NEXT:    i32.const $push67=, 144
-; CHECK-NEXT:    i32.add $push68=, $pop87, $pop67
-; CHECK-NEXT:    i32.const $push0=, 8
-; CHECK-NEXT:    i32.add $push1=, $pop68, $pop0
-; CHECK-NEXT:    i64.load $push2=, 0($pop1)
-; CHECK-NEXT:    call cosl, $pop66, $pop3, $pop2
-; CHECK-NEXT:    local.get $push88=, 3
-; CHECK-NEXT:    i32.const $push61=, 112
-; CHECK-NEXT:    i32.add $push62=, $pop88, $pop61
-; CHECK-NEXT:    local.get $push89=, 3
-; CHECK-NEXT:    i64.load $push6=, 128($pop89)
-; CHECK-NEXT:    local.get $push90=, 3
-; CHECK-NEXT:    i32.const $push63=, 128
-; CHECK-NEXT:    i32.add $push64=, $pop90, $pop63
-; CHECK-NEXT:    i32.const $push79=, 8
-; CHECK-NEXT:    i32.add $push4=, $pop64, $pop79
-; CHECK-NEXT:    i64.load $push5=, 0($pop4)
-; CHECK-NEXT:    call tanl, $pop62, $pop6, $pop5
-; CHECK-NEXT:    local.get $push91=, 3
-; CHECK-NEXT:    i32.const $push57=, 96
-; CHECK-NEXT:    i32.add $push58=, $pop91, $pop57
-; CHECK-NEXT:    local.get $push92=, 3
-; CHECK-NEXT:    i64.load $push9=, 112($pop92)
-; CHECK-NEXT:    local.get $push93=, 3
-; CHECK-NEXT:    i32.const $push59=, 112
-; CHECK-NEXT:    i32.add $push60=, $pop93, $pop59
-; CHECK-NEXT:    i32.const $push78=, 8
-; CHECK-NEXT:    i32.add $push7=, $pop60, $pop78
-; CHECK-NEXT:    i64.load $push8=, 0($pop7)
-; CHECK-NEXT:    call asinl, $pop58, $pop9, $pop8
-; CHECK-NEXT:    local.get $push94=, 3
-; CHECK-NEXT:    i32.const $push53=, 80
-; CHECK-NEXT:    i32.add $push54=, $pop94, $pop53
-; CHECK-NEXT:    local.get $push95=, 3
-; CHECK-NEXT:    i64.load $push12=, 96($pop95)
-; CHECK-NEXT:    local.get $push96=, 3
-; CHECK-NEXT:    i32.const $push55=, 96
-; CHECK-NEXT:    i32.add $push56=, $pop96, $pop55
-; CHECK-NEXT:    i32.const $push77=, 8
-; CHECK-NEXT:    i32.add $push10=, $pop56, $pop77
-; CHECK-NEXT:    i64.load $push11=, 0($pop10)
-; CHECK-NEXT:    call acosl, $pop54, $pop12, $pop11
-; CHECK-NEXT:    local.get $push97=, 3
-; CHECK-NEXT:    i32.const $push49=, 64
-; CHECK-NEXT:    i32.add $push50=, $pop97, $pop49
-; CHECK-NEXT:    local.get $push98=, 3
-; CHECK-NEXT:    i64.load $push15=, 80($pop98)
-; CHECK-NEXT:    local.get $push99=, 3
-; CHECK-NEXT:    i32.const $push51=, 80
-; CHECK-NEXT:    i32.add $push52=, $pop99, $pop51
-; CHECK-NEXT:    i32.const $push76=, 8
-; CHECK-NEXT:    i32.add $push13=, $pop52, $pop76
-; CHECK-NEXT:    i64.load $push14=, 0($pop13)
-; CHECK-NEXT:    call atanl, $pop50, $pop15, $pop14
-; CHECK-NEXT:    local.get $push100=, 3
-; CHECK-NEXT:    i32.const $push45=, 48
-; CHECK-NEXT:    i32.add $push46=, $pop100, $pop45
-; CHECK-NEXT:    local.get $push101=, 3
-; CHECK-NEXT:    i64.load $push18=, 64($pop101)
-; CHECK-NEXT:    local.get $push102=, 3
-; CHECK-NEXT:    i32.const $push47=, 64
-; CHECK-NEXT:    i32.add $push48=, $pop102, $pop47
-; CHECK-NEXT:    i32.const $push75=, 8
-; CHECK-NEXT:    i32.add $push16=, $pop48, $pop75
-; CHECK-NEXT:    i64.load $push17=, 0($pop16)
-; CHECK-NEXT:    call sinhl, $pop46, $pop18, $pop17
-; CHECK-NEXT:    local.get $push103=, 3
-; CHECK-NEXT:    i32.const $push41=, 32
-; CHECK-NEXT:    i32.add $push42=, $pop103, $pop41
-; CHECK-NEXT:    local.get $push104=, 3
-; CHECK-NEXT:    i64.load $push21=, 48($pop104)
-; CHECK-NEXT:    local.get $push105=, 3
-; CHECK-NEXT:    i32.const $push43=, 48
-; CHECK-NEXT:    i32.add $push44=, $pop105, $pop43
-; CHECK-NEXT:    i32.const $push74=, 8
-; CHECK-NEXT:    i32.add $push19=, $pop44, $pop74
-; CHECK-NEXT:    i64.load $push20=, 0($pop19)
-; CHECK-NEXT:    call coshl, $pop42, $pop21, $pop20
-; CHECK-NEXT:    local.get $push106=, 3
-; CHECK-NEXT:    i32.const $push37=, 16
-; CHECK-NEXT:    i32.add $push38=, $pop106, $pop37
-; CHECK-NEXT:    local.get $push107=, 3
-; CHECK-NEXT:    i64.load $push24=, 32($pop107)
-; CHECK-NEXT:    local.get $push108=, 3
-; CHECK-NEXT:    i32.const $push39=, 32
-; CHECK-NEXT:    i32.add $push40=, $pop108, $pop39
-; CHECK-NEXT:    i32.const $push73=, 8
-; CHECK-NEXT:    i32.add $push22=, $pop40, $pop73
-; CHECK-NEXT:    i64.load $push23=, 0($pop22)
-; CHECK-NEXT:    call tanhl, $pop38, $pop24, $pop23
-; CHECK-NEXT:    local.get $push113=, 3
-; CHECK-NEXT:    local.get $push112=, 1
-; CHECK-NEXT:    local.get $push111=, 2
-; CHECK-NEXT:    local.get $push109=, 3
-; CHECK-NEXT:    i64.load $push27=, 16($pop109)
-; CHECK-NEXT:    local.get $push110=, 3
-; CHECK-NEXT:    i32.const $push35=, 16
-; CHECK-NEXT:    i32.add $push36=, $pop110, $pop35
-; CHECK-NEXT:    i32.const $push72=, 8
-; CHECK-NEXT:    i32.add $push25=, $pop36, $pop72
-; CHECK-NEXT:    i64.load $push26=, 0($pop25)
-; CHECK-NEXT:    call atan2l, $pop113, $pop112, $pop111, $pop27, $pop26
-; CHECK-NEXT:    local.get $push115=, 0
-; CHECK-NEXT:    local.get $push114=, 3
-; CHECK-NEXT:    i32.const $push71=, 8
-; CHECK-NEXT:    i32.add $push28=, $pop114, $pop71
-; CHECK-NEXT:    i64.load $push29=, 0($pop28)
-; CHECK-NEXT:    i64.store 8($pop115), $pop29
-; CHECK-NEXT:    local.get $push117=, 0
-; CHECK-NEXT:    local.get $push116=, 3
-; CHECK-NEXT:    i64.load $push30=, 0($pop116)
-; CHECK-NEXT:    i64.store 0($pop117), $pop30
-; CHECK-NEXT:    local.get $push118=, 3
-; CHECK-NEXT:    i32.const $push33=, 160
-; CHECK-NEXT:    i32.add $push34=, $pop118, $pop33
-; CHECK-NEXT:    global.set __stack_pointer, $pop34
+; CHECK-NEXT:    global.get $push20=, __stack_pointer
+; CHECK-NEXT:    i32.const $push21=, 160
+; CHECK-NEXT:    i32.sub  $push43=, $pop20, $pop21
+; CHECK-NEXT:    local.tee $push42=, 3, $pop43
+; CHECK-NEXT:    global.set __stack_pointer, $pop42
+; CHECK-NEXT:    local.get $push44=, 3
+; CHECK-NEXT:    i32.const $push40=, 144
+; CHECK-NEXT:    i32.add  $push41=, $pop44, $pop40
+; CHECK-NEXT:    local.get $push46=, 1
+; CHECK-NEXT:    local.get $push45=, 2
+; CHECK-NEXT:    call sinl, $pop41, $pop46, $pop45
+; CHECK-NEXT:    local.get $push47=, 3
+; CHECK-NEXT:    i32.const $push38=, 128
+; CHECK-NEXT:    i32.add  $push39=, $pop47, $pop38
+; CHECK-NEXT:    local.get $push48=, 3
+; CHECK-NEXT:    i64.load $push1=, 144($pop48)
+; CHECK-NEXT:    local.get $push49=, 3
+; CHECK-NEXT:    i64.load $push0=, 152($pop49)
+; CHECK-NEXT:    call cosl, $pop39, $pop1, $pop0
+; CHECK-NEXT:    local.get $push50=, 3
+; CHECK-NEXT:    i32.const $push36=, 112
+; CHECK-NEXT:    i32.add  $push37=, $pop50, $pop36
+; CHECK-NEXT:    local.get $push51=, 3
+; CHECK-NEXT:    i64.load $push3=, 128($pop51)
+; CHECK-NEXT:    local.get $push52=, 3
+; CHECK-NEXT:    i64.load $push2=, 136($pop52)
+; CHECK-NEXT:    call tanl, $pop37, $pop3, $pop2
+; CHECK-NEXT:    local.get $push53=, 3
+; CHECK-NEXT:    i32.const $push34=, 96
+; CHECK-NEXT:    i32.add  $push35=, $pop53, $pop34
+; CHECK-NEXT:    local.get $push54=, 3
+; CHECK-NEXT:    i64.load $push5=, 112($pop54)
+; CHECK-NEXT:    local.get $push55=, 3
+; CHECK-NEXT:    i64.load $push4=, 120($pop55)
+; CHECK-NEXT:    call asinl, $pop35, $pop5, $pop4
+; CHECK-NEXT:    local.get $push56=, 3
+; CHECK-NEXT:    i32.const $push32=, 80
+; CHECK-NEXT:    i32.add  $push33=, $pop56, $pop32
+; CHECK-NEXT:    local.get $push57=, 3
+; CHECK-NEXT:    i64.load $push7=, 96($pop57)
+; CHECK-NEXT:    local.get $push58=, 3
+; CHECK-NEXT:    i64.load $push6=, 104($pop58)
+; CHECK-NEXT:    call acosl, $pop33, $pop7, $pop6
+; CHECK-NEXT:    local.get $push59=, 3
+; CHECK-NEXT:    i32.const $push30=, 64
+; CHECK-NEXT:    i32.add  $push31=, $pop59, $pop30
+; CHECK-NEXT:    local.get $push60=, 3
+; CHECK-NEXT:    i64.load $push9=, 80($pop60)
+; CHECK-NEXT:    local.get $push61=, 3
+; CHECK-NEXT:    i64.load $push8=, 88($pop61)
+; CHECK-NEXT:    call atanl, $pop31, $pop9, $pop8
+; CHECK-NEXT:    local.get $push62=, 3
+; CHECK-NEXT:    i32.const $push28=, 48
+; CHECK-NEXT:    i32.add  $push29=, $pop62, $pop28
+; CHECK-NEXT:    local.get $push63=, 3
+; CHECK-NEXT:    i64.load $push11=, 64($pop63)
+; CHECK-NEXT:    local.get $push64=, 3
+; CHECK-NEXT:    i64.load $push10=, 72($pop64)
+; CHECK-NEXT:    call sinhl, $pop29, $pop11, $pop10
+; CHECK-NEXT:    local.get $push65=, 3
+; CHECK-NEXT:    i32.const $push26=, 32
+; CHECK-NEXT:    i32.add  $push27=, $pop65, $pop26
+; CHECK-NEXT:    local.get $push66=, 3
+; CHECK-NEXT:    i64.load $push13=, 48($pop66)
+; CHECK-NEXT:    local.get $push67=, 3
+; CHECK-NEXT:    i64.load $push12=, 56($pop67)
+; CHECK-NEXT:    call coshl, $pop27, $pop13, $pop12
+; CHECK-NEXT:    local.get $push68=, 3
+; CHECK-NEXT:    i32.const $push24=, 16
+; CHECK-NEXT:    i32.add  $push25=, $pop68, $pop24
+; CHECK-NEXT:    local.get $push69=, 3
+; CHECK-NEXT:    i64.load $push15=, 32($pop69)
+; CHECK-NEXT:    local.get $push70=, 3
+; CHECK-NEXT:    i64.load $push14=, 40($pop70)
+; CHECK-NEXT:    call tanhl, $pop25, $pop15, $pop14
+; CHECK-NEXT:    local.get $push75=, 3
+; CHECK-NEXT:    local.get $push74=, 1
+; CHECK-NEXT:    local.get $push73=, 2
+; CHECK-NEXT:    local.get $push71=, 3
+; CHECK-NEXT:    i64.load $push17=, 16($pop71)
+; CHECK-NEXT:    local.get $push72=, 3
+; CHECK-NEXT:    i64.load $push16=, 24($pop72)
+; CHECK-NEXT:    call atan2l, $pop75, $pop74, $pop73, $pop17, $pop16
+; CHECK-NEXT:    local.get $push77=, 0
+; CHECK-NEXT:    local.get $push76=, 3
+; CHECK-NEXT:    i64.load $push18=, 8($pop76)
+; CHECK-NEXT:    i64.store 8($pop77), $pop18
+; CHECK-NEXT:    local.get $push79=, 0
+; CHECK-NEXT:    local.get $push78=, 3
+; CHECK-NEXT:    i64.load $push19=, 0($pop78)
+; CHECK-NEXT:    i64.store 0($pop79), $pop19
+; CHECK-NEXT:    local.get $push80=, 3
+; CHECK-NEXT:    i32.const $push22=, 160
+; CHECK-NEXT:    i32.add  $push23=, $pop80, $pop22
+; CHECK-NEXT:    global.set __stack_pointer, $pop23
 ; CHECK-NEXT:    return
   ; libm calls
   %d = call fp128 @llvm.sin.f128(fp128 %x)
diff --git a/llvm/test/CodeGen/WebAssembly/libcalls.ll b/llvm/test/CodeGen/WebAssembly/libcalls.ll
index 70f000664d388a..39657fe2c5870a 100644
--- a/llvm/test/CodeGen/WebAssembly/libcalls.ll
+++ b/llvm/test/CodeGen/WebAssembly/libcalls.ll
@@ -32,134 +32,100 @@ define fp128 @fp128libcalls(fp128 %x, fp128 %y, i32 %z) {
 ; CHECK:         .functype fp128libcalls (i32, i64, i64, i64, i64, i32) -> ()
 ; CHECK-NEXT:    .local i32
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push28=, __stack_pointer
-; CHECK-NEXT:    i32.const $push29=, 144
-; CHECK-NEXT:    i32.sub $push73=, $pop28, $pop29
-; CHECK-NEXT:    local.tee $push72=, 6, $pop73
-; CHECK-NEXT:    global.set __stack_pointer, $pop72
+; CHECK-NEXT:    global.get $push18=, __stack_pointer
+; CHECK-NEXT:    i32.const $push19=, 144
+; CHECK-NEXT:    i32.sub $push39=, $pop18, $pop19
+; CHECK-NEXT:    local.tee $push38=, 6, $pop39
+; CHECK-NEXT:    global.set __stack_pointer, $pop38
+; CHECK-NEXT:    local.get $push40=, 6
+; CHECK-NEXT:    i32.const $push36=, 128
+; CHECK-NEXT:    i32.add $push37=, $pop40, $pop36
+; CHECK-NEXT:    local.get $push44=, 1
+; CHECK-NEXT:    local.get $push43=, 2
+; CHECK-NEXT:    local.get $push42=, 3
+; CHECK-NEXT:    local.get $push41=, 4
+; CHECK-NEXT:    call __addtf3, $pop37, $pop44, $pop43, $pop42, $pop41
+; CHECK-NEXT:    local.get $push45=, 6
+; CHECK-NEXT:    i32.const $push34=, 112
+; CHECK-NEXT:    i32.add $push35=, $pop45, $pop34
+; CHECK-NEXT:    local.get $push46=, 6
+; CHECK-NEXT:    i64.load $push1=, 128($pop46)
+; CHECK-NEXT:    local.get $push47=, 6
+; CHECK-NEXT:    i64.load $push0=, 136($pop47)
+; CHECK-NEXT:    local.get $push49=, 3
+; CHECK-NEXT:    local.get $push48=, 4
+; CHECK-NEXT:    call __multf3, $pop35, $pop1, $pop0, $pop49, $pop48
+; CHECK-NEXT:    local.get $push50=, 6
+; CHECK-NEXT:    i32.const $push32=, 96
+; CHECK-NEXT:    i32.add $push33=, $pop50, $pop32
+; CHECK-NEXT:    local.get $push51=, 6
+; CHECK-NEXT:    i64.load $push3=, 112($pop51)
+; CHECK-NEXT:    local.get $push52=, 6
+; CHECK-NEXT:    i64.load $push2=, 120($pop52)
+; CHECK-NEXT:    local.get $push54=, 3
+; CHECK-NEXT:    local.get $push53=, 4
+; CHECK-NEXT:    call __divtf3, $pop33, $pop3, $pop2, $pop54, $pop53
+; CHECK-NEXT:    local.get $push55=, 6
+; CHECK-NEXT:    i32.const $push30=, 80
+; CHECK-NEXT:    i32.add $push31=, $pop55, $pop30
+; CHECK-NEXT:    local.get $push56=, 6
+; CHECK-NEXT:    i64.load $push5=, 96($pop56)
+; CHECK-NEXT:    local.get $push57=, 6
+; CHECK-NEXT:    i64.load $push4=, 104($pop57)
+; CHECK-NEXT:    call sqrtl, $pop31, $pop5, $pop4
+; CHECK-NEXT:    local.get $push58=, 6
+; CHECK-NEXT:    i32.const $push28=, 64
+; CHECK-NEXT:    i32.add $push29=, $pop58, $pop28
+; CHECK-NEXT:    local.get $push59=, 6
+; CHECK-NEXT:    i64.load $push7=, 80($pop59)
+; CHECK-NEXT:    local.get $push60=, 6
+; CHECK-NEXT:    i64.load $push6=, 88($pop60)
+; CHECK-NEXT:    call floorl, $pop29, $pop7, $pop6
+; CHECK-NEXT:    local.get $push61=, 6
+; CHECK-NEXT:    i32.const $push26=, 48
+; CHECK-NEXT:    i32.add $push27=, $pop61, $pop26
+; CHECK-NEXT:    local.get $push62=, 6
+; CHECK-NEXT:    i64.load $push9=, 64($pop62)
+; CHECK-NEXT:    local.get $push63=, 6
+; CHECK-NEXT:    i64.load $push8=, 72($pop63)
+; CHECK-NEXT:    local.get $push65=, 3
+; CHECK-NEXT:    local.get $push64=, 4
+; CHECK-NEXT:    call powl, $pop27, $pop9, $pop8, $pop65, $pop64
+; CHECK-NEXT:    local.get $push66=, 6
+; CHECK-NEXT:    i32.const $push24=, 32
+; CHECK-NEXT:    i32.add $push25=, $pop66, $pop24
+; CHECK-NEXT:    local.get $push67=, 6
+; CHECK-NEXT:    i64.load $push11=, 48($pop67)
+; CHECK-NEXT:    local.get $push68=, 6
+; CHECK-NEXT:    i64.load $push10=, 56($pop68)
+; CHECK-NEXT:    local.get $push69=, 5
+; CHECK-NEXT:    call __powitf2, $pop25, $pop11, $pop10, $pop69
+; CHECK-NEXT:    local.get $push70=, 6
+; CHECK-NEXT:    i32.const $push22=, 16
+; CHECK-NEXT:    i32.add $push23=, $pop70, $pop22
+; CHECK-NEXT:    local.get $push71=, 6
+; CHECK-NEXT:    i64.load $push13=, 32($pop71)
+; CHECK-NEXT:    local.get $push72=, 6
+; CHECK-NEXT:    i64.load $push12=, 40($pop72)
+; CHECK-NEXT:    call truncl, $pop23, $pop13, $pop12
+; CHECK-NEXT:    local.get $push75=, 6
+; CHECK-NEXT:    local.get $push73=, 6
+; CHECK-NEXT:    i64.load $push15=, 16($pop73)
 ; CHECK-NEXT:    local.get $push74=, 6
-; CHECK-NEXT:    i32.const $push62=, 128
-; CHECK-NEXT:    i32.add $push63=, $pop74, $pop62
-; CHECK-NEXT:    local.get $push78=, 1
-; CHECK-NEXT:    local.get $push77=, 2
-; CHECK-NEXT:    local.get $push76=, 3
-; CHECK-NEXT:    local.get $push75=, 4
-; CHECK-NEXT:    call __addtf3, $pop63, $pop78, $pop77, $pop76, $pop75
-; CHECK-NEXT:    local.get $push79=, 6
-; CHECK-NEXT:    i32.const $push58=, 112
-; CHECK-NEXT:    i32.add $push59=, $pop79, $pop58
+; CHECK-NEXT:    i64.load $push14=, 24($pop74)
+; CHECK-NEXT:    call nearbyintl, $pop75, $pop15, $pop14
+; CHECK-NEXT:    local.get $push77=, 0
+; CHECK-NEXT:    local.get $push76=, 6
+; CHECK-NEXT:    i64.load $push16=, 8($pop76)
+; CHECK-NEXT:    i64.store 8($pop77), $pop16
+; CHECK-NEXT:    local.get $push79=, 0
+; CHECK-NEXT:    local.get $push78=, 6
+; CHECK-NEXT:    i64.load $push17=, 0($pop78)
+; CHECK-NEXT:    i64.store 0($pop79), $pop17
 ; CHECK-NEXT:    local.get $push80=, 6
-; CHECK-NEXT:    i64.load $push3=, 128($pop80)
-; CHECK-NEXT:    local.get $push81=, 6
-; CHECK-NEXT:    i32.const $push60=, 128
-; CHECK-NEXT:    i32.add $push61=, $pop81, $pop60
-; CHECK-NEXT:    i32.const $push0=, 8
-; CHECK-NEXT:    i32.add $push1=, $pop61, $pop0
-; CHECK-NEXT:    i64.load $push2=, 0($pop1)
-; CHECK-NEXT:    local.get $push83=, 3
-; CHECK-NEXT:    local.get $push82=, 4
-; CHECK-NEXT:    call __multf3, $pop59, $pop3, $pop2, $pop83, $pop82
-; CHECK-NEXT:    local.get $push84=, 6
-; CHECK-NEXT:    i32.const $push54=, 96
-; CHECK-NEXT:    i32.add $push55=, $pop84, $pop54
-; CHECK-NEXT:    local.get $push85=, 6
-; CHECK-NEXT:    i64.load $push6=, 112($pop85)
-; CHECK-NEXT:    local.get $push86=, 6
-; CHECK-NEXT:    i32.const $push56=, 112
-; CHECK-NEXT:    i32.add $push57=, $pop86, $pop56
-; CHECK-NEXT:    i32.const $push71=, 8
-; CHECK-NEXT:    i32.add $push4=, $pop57, $pop71
-; CHECK-NEXT:    i64.load $push5=, 0($pop4)
-; CHECK-NEXT:    local.get $push88=, 3
-; CHECK-NEXT:    local.get $push87=, 4
-; CHECK-NEXT:    call __divtf3, $pop55, $pop6, $pop5, $pop88, $pop87
-; CHECK-NEXT:    local.get $push89=, 6
-; CHECK-NEXT:    i32.const $push50=, 80
-; CHECK-NEXT:    i32.add $push51=, $pop89, $pop50
-; CHECK-NEXT:    local.get $push90=, 6
-; CHECK-NEXT:    i64.load $push9=, 96($pop90)
-; CHECK-NEXT:    local.get $push91=, 6
-; CHECK-NEXT:    i32.const $push52=, 96
-; CHECK-NEXT:    i32.add $push53=, $pop91, $pop52
-; CHECK-NEXT:    i32.const $push70=, 8
-; CHECK-NEXT:    i32.add $push7=, $pop53, $pop70
-; CHECK-NEXT:    i64.load $push8=, 0($pop7)
-; CHECK-NEXT:    call sqrtl, $pop51, $pop9, $pop8
-; CHECK-NEXT:    local.get $push92=, 6
-; CHECK-NEXT:    i32.const $push46=, 64
-; CHECK-NEXT:    i32.add $push47=, $pop92, $pop46
-; CHECK-NEXT:    local.get $push93=, 6
-; CHECK-NEXT:    i64.load $push12=, 80($pop93)
-; CHECK-NEXT:    local.get $push94=, 6
-; CHECK-NEXT:    i32.const $push48=, 80
-; CHECK-NEXT:    i32.add $push49=, $pop94, $pop48
-; CHECK-NEXT:    i32.const $push69=, 8
-; CHECK-NEXT:    i32.add $push10=, $pop49, $pop69
-; CHECK-NEXT:    i64.load $push11=, 0($pop10)
-; CHECK-NEXT:    call floorl, $pop47, $pop12, $pop11
-; CHECK-NEXT:    local.get $push95=, 6
-; CHECK-NEXT:    i32.const $push42=, 48
-; CHECK-NEXT:    i32.add $push43=, $pop95, $pop42
-; CHECK-NEXT:    local.get $push96=, 6
-; CHECK-NEXT:    i64.load $push15=, 64($pop96)
-; CHECK-NEXT:    local.get $push97=, 6
-; CHECK-NEXT:    i32.const $push44=, 64
-; CHECK-NEXT:    i32.add $push45=, $pop97, $pop44
-; CHECK-NEXT:    i32.const $push68=, 8
-; CHECK-NEXT:    i32.add $push13=, $pop45, $pop68
-; CHECK-NEXT:    i64.load $push14=, 0($pop13)
-; CHECK-NEXT:    local.get $push99=, 3
-; CHECK-NEXT:    local.get $push98=, 4
-; CHECK-NEXT:    call powl, $pop43, $pop15, $pop14, $pop99, $pop98
-; CHECK-NEXT:    local.get $push100=, 6
-; CHECK-NEXT:    i32.const $push38=, 32
-; CHECK-NEXT:    i32.add $push39=, $pop100, $pop38
-; CHECK-NEXT:    local.get $push101=, 6
-; CHECK-NEXT:    i64.load $push18=, 48($pop101)
-; CHECK-NEXT:    local.get $push102=, 6
-; CHECK-NEXT:    i32.const $push40=, 48
-; CHECK-NEXT:    i32.add $push41=, $pop102, $pop40
-; CHECK-NEXT:    i32.const $push67=, 8
-; CHECK-NEXT:    i32.add $push16=, $pop41, $pop67
-; CHECK-NEXT:    i64.load $push17=, 0($pop16)
-; CHECK-NEXT:    local.get $push103=, 5
-; CHECK-NEXT:    call __powitf2, $pop39, $pop18, $pop17, $pop103
-; CHECK-NEXT:    local.get $push104=, 6
-; CHECK-NEXT:    i32.const $push34=, 16
-; CHECK-NEXT:    i32.add $push35=, $pop104, $pop34
-; CHECK-NEXT:    local.get $push105=, 6
-; CHECK-NEXT:    i64.load $push21=, 32($pop105)
-; CHECK-NEXT:    local.get $push106=, 6
-; CHECK-NEXT:    i32.const $push36=, 32
-; CHECK-NEXT:    i32.add $push37=, $pop106, $pop36
-; CHECK-NEXT:    i32.const $push66=, 8
-; CHECK-NEXT:    i32.add $push19=, $pop37, $pop66
-; CHECK-NEXT:    i64.load $push20=, 0($pop19)
-; CHECK-NEXT:    call truncl, $pop35, $pop21, $pop20
-; CHECK-NEXT:    local.get $push109=, 6
-; CHECK-NEXT:    local.get $push107=, 6
-; CHECK-NEXT:    i64.load $push24=, 16($pop107)
-; CHECK-NEXT:    local.get $push108=, 6
-; CHECK-NEXT:    i32.const $push32=, 16
-; CHECK-NEXT:    i32.add $push33=, $pop108, $pop32
-; CHECK-NEXT:    i32.const $push65=, 8
-; CHECK-NEXT:    i32.add $push22=, $pop33, $pop65
-; CHECK-NEXT:    i64.load $push23=, 0($pop22)
-; CHECK-NEXT:    call nearbyintl, $pop109, $pop24, $pop23
-; CHECK-NEXT:    local.get $push111=, 0
-; CHECK-NEXT:    local.get $push110=, 6
-; CHECK-NEXT:    i32.const $push64=, 8
-; CHECK-NEXT:    i32.add $push25=, $pop110, $pop64
-; CHECK-NEXT:    i64.load $push26=, 0($pop25)
-; CHECK-NEXT:    i64.store 8($pop111), $pop26
-; CHECK-NEXT:    local.get $push113=, 0
-; CHECK-NEXT:    local.get $push112=, 6
-; CHECK-NEXT:    i64.load $push27=, 0($pop112)
-; CHECK-NEXT:    i64.store 0($pop113), $pop27
-; CHECK-NEXT:    local.get $push114=, 6
-; CHECK-NEXT:    i32.const $push30=, 144
-; CHECK-NEXT:    i32.add $push31=, $pop114, $pop30
-; CHECK-NEXT:    global.set __stack_pointer, $pop31
+; CHECK-NEXT:    i32.const $push20=, 144
+; CHECK-NEXT:    i32.add $push21=, $pop80, $pop20
+; CHECK-NEXT:    global.set __stack_pointer, $pop21
 ; CHECK-NEXT:    return
   %a = fadd fp128 %x, %y
   %b = fmul fp128 %a, %y
@@ -180,55 +146,49 @@ define i128 @i128libcalls(i128 %x, i128 %y) {
 ; CHECK:         .functype i128libcalls (i32, i64, i64, i64, i64) -> ()
 ; CHECK-NEXT:    .local i32, i64
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push11=, __stack_pointer
-; CHECK-NEXT:    i32.const $push12=, 32
-; CHECK-NEXT:    i32.sub $push23=, $pop11, $pop12
-; CHECK-NEXT:    local.tee $push22=, 5, $pop23
-; CHECK-NEXT:    global.set __stack_pointer, $pop22
-; CHECK-NEXT:    local.get $push24=, 5
-; CHECK-NEXT:    i32.const $push17=, 16
-; CHECK-NEXT:    i32.add $push18=, $pop24, $pop17
-; CHECK-NEXT:    local.get $push26=, 1
-; CHECK-NEXT:    local.get $push25=, 3
-; CHECK-NEXT:    i64.add $push21=, $pop26, $pop25
-; CHECK-NEXT:    local.tee $push20=, 6, $pop21
-; CHECK-NEXT:    local.get $push28=, 2
-; CHECK-NEXT:    local.get $push27=, 4
-; CHECK-NEXT:    i64.add $push0=, $pop28, $pop27
-; CHECK-NEXT:    local.get $push30=, 6
-; CHECK-NEXT:    local.get $push29=, 1
-; CHECK-NEXT:    i64.lt_u $push1=, $pop30, $pop29
+; CHECK-NEXT:    global.get $push8=, __stack_pointer
+; CHECK-NEXT:    i32.const $push9=, 32
+; CHECK-NEXT:    i32.sub $push17=, $pop8, $pop9
+; CHECK-NEXT:    local.tee $push16=, 5, $pop17
+; CHECK-NEXT:    global.set __stack_pointer, $pop16
+; CHECK-NEXT:    local.get $push18=, 5
+; CHECK-NEXT:    i32.const $push12=, 16
+; CHECK-NEXT:    i32.add $push13=, $pop18, $pop12
+; CHECK-NEXT:    local.get $push20=, 1
+; CHECK-NEXT:    local.get $push19=, 3
+; CHECK-NEXT:    i64.add $push15=, $pop20, $pop19
+; CHECK-NEXT:    local.tee $push14=, 6, $pop15
+; CHECK-NEXT:    local.get $push22=, 2
+; CHECK-NEXT:    local.get $push21=, 4
+; CHECK-NEXT:    i64.add $push0=, $pop22, $pop21
+; CHECK-NEXT:    local.get $push24=, 6
+; CHECK-NEXT:    local.get $push23=, 1
+; CHECK-NEXT:    i64.lt_u $push1=, $pop24, $pop23
 ; CHECK-NEXT:    i64.extend_i32_u $push2=, $pop1
 ; CHECK-NEXT:    i64.add $push3=, $pop0, $pop2
-; CHECK-NEXT:    local.get $push32=, 3
-; CHECK-NEXT:    local.get $push31=, 4
-; CHECK-NEXT:    call __multi3, $pop18, $pop20, $pop3, $pop32, $pop31
-; CHECK-NEXT:    local.get $push37=, 5
-; CHECK-NEXT:    local.get $push33=, 5
-; CHECK-NEXT:    i64.load $push7=, 16($pop33)
+; CHECK-NEXT:    local.get $push26=, 3
+; CHECK-NEXT:    local.get $push25=, 4
+; CHECK-NEXT:    call __multi3, $pop13, $pop14, $pop3, $pop26, $pop25
+; CHECK-NEXT:    local.get $push31=, 5
+; CHECK-NEXT:    local.get $push27=, 5
+; CHECK-NEXT:    i64.load $push5=, 16($pop27)
+; CHECK-NEXT:    local.get $push28=, 5
+; CHECK-NEXT:    i64.load $push4=, 24($pop28)
+; CHECK-NEXT:    local.get $push30=, 3
+; CHECK-NEXT:    local.get $push29=, 4
+; CHECK-NEXT:    call __umodti3, $pop31, $pop5, $pop4, $pop30, $pop29
+; CHECK-NEXT:    local.get $push33=, 0
+; CHECK-NEXT:    local.get $push32=, 5
+; CHECK-NEXT:    i64.load $push6=, 8($pop32)
+; CHECK-NEXT:    i64.store 8($pop33), $pop6
+; CHECK-NEXT:    local.get $push35=, 0
 ; CHECK-NEXT:    local.get $push34=, 5
-; CHECK-NEXT:    i32.const $push15=, 16
-; CHECK-NEXT:    i32.add $push16=, $pop34, $pop15
-; CHECK-NEXT:    i32.const $push4=, 8
-; CHECK-NEXT:    i32.add $push5=, $pop16, $pop4
-; CHECK-NEXT:    i64.load $push6=, 0($pop5)
-; CHECK-NEXT:    local.get $push36=, 3
-; CHECK-NEXT:    local.get $push35=, 4
-; CHECK-NEXT:    call __umodti3, $pop37, $pop7, $pop6, $pop36, $pop35
-; CHECK-NEXT:    local.get $push39=, 0
-; CHECK-NEXT:    local.get $push38=, 5
-; CHECK-NEXT:    i32.const $push19=, 8
-; CHECK-NEXT:    i32.add $push8=, $pop38, $pop19
-; CHECK-NEXT:    i64.load $push9=, 0($pop8)
-; CHECK-NEXT:    i64.store 8($pop39), $pop9
-; CHECK-NEXT:    local.get $push41=, 0
-; CHECK-NEXT:    local.get $push40=, 5
-; CHECK-NEXT:    i64.load $push10=, 0($pop40)
-; CHECK-NEXT:    i64.store 0($pop41), $pop10
-; CHECK-NEXT:    local.get $push42=, 5
-; CHECK-NEXT:    i32.const $push13=, 32
-; CHECK-NEXT:    i32.add $push14=, $pop42, $pop13
-; CHECK-NEXT:    global.set __stack_pointer, $pop14
+; CHECK-NEXT:    i64.load $push7=, 0($pop34)
+; CHECK-NEXT:    i64.store 0($pop35), $pop7
+; CHECK-NEXT:    local.get $push36=, 5
+; CHECK-NEXT:    i32.const $push10=, 32
+; CHECK-NEXT:    i32.add $push11=, $pop36, $pop10
+; CHECK-NEXT:    global.set __stack_pointer, $pop11
 ; CHECK-NEXT:    return
   %a = add i128 %x, %y
   %b = mul i128 %a, %y
@@ -243,7 +203,7 @@ define double @f64libcalls(double %x, double %y, i32 %z) {
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    global.get $push12=, __stack_pointer
 ; CHECK-NEXT:    i32.const $push13=, 16
-; CHECK-NEXT:    i32.sub  $push19=, $pop12, $pop13
+; CHECK-NEXT:    i32.sub $push19=, $pop12, $pop13
 ; CHECK-NEXT:    local.tee $push18=, 3, $pop19
 ; CHECK-NEXT:    global.set __stack_pointer, $pop18
 ; CHECK-NEXT:    local.get $push23=, 0
@@ -263,7 +223,7 @@ define double @f64libcalls(double %x, double %y, i32 %z) {
 ; CHECK-NEXT:    call $push10=, ldexp, $pop23, $pop9
 ; CHECK-NEXT:    local.get $push24=, 3
 ; CHECK-NEXT:    i32.const $push16=, 12
-; CHECK-NEXT:    i32.add  $push17=, $pop24, $pop16
+; CHECK-NEXT:    i32.add $push17=, $pop24, $pop16
 ; CHECK-NEXT:    call $push25=, frexp, $pop10, $pop17
 ; CHECK-NEXT:    local.set 0, $pop25
 ; CHECK-NEXT:    local.get $push26=, 3
@@ -271,7 +231,7 @@ define double @f64libcalls(double %x, double %y, i32 %z) {
 ; CHECK-NEXT:    call escape_value, $pop11
 ; CHECK-NEXT:    local.get $push27=, 3
 ; CHECK-NEXT:    i32.const $push14=, 16
-; CHECK-NEXT:    i32.add  $push15=, $pop27, $pop14
+; CHECK-NEXT:    i32.add $push15=, $pop27, $pop14
 ; CHECK-NEXT:    global.set __stack_pointer, $pop15
 ; CHECK-NEXT:    local.get $push28=, 0
 ; CHECK-NEXT:    return $pop28
diff --git a/llvm/test/CodeGen/WebAssembly/multi-return.ll b/llvm/test/CodeGen/WebAssembly/multi-return.ll
index 293a1b35c39c6a..9ed9edb4775ce6 100644
--- a/llvm/test/CodeGen/WebAssembly/multi-return.ll
+++ b/llvm/test/CodeGen/WebAssembly/multi-return.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -asm-verbose=false -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
 
 target triple = "wasm32-unknown-unknown"
 
@@ -9,9 +10,22 @@ declare { i64, i128, i192, i128, i64 } @return_multi_multi()
 
 define i64 @test0() {
 ; CHECK-LABEL: test0:
-; CHECK: call    	return_multi_multi
-; CHECK: i64.load	$[[RV:[0-9]+]]=, 8(${{[0-9]+}})
-; CHECK: local.copy	$push8=, $[[RV]]
+; CHECK:         .functype test0 () -> (i64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push0=, __stack_pointer
+; CHECK-NEXT:    i32.const $push1=, 80
+; CHECK-NEXT:    i32.sub $push7=, $pop0, $pop1
+; CHECK-NEXT:    local.tee $push6=, $1=, $pop7
+; CHECK-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-NEXT:    i32.const $push4=, 8
+; CHECK-NEXT:    i32.add $push5=, $1, $pop4
+; CHECK-NEXT:    call return_multi_multi, $pop5
+; CHECK-NEXT:    i64.load $0=, 8($1)
+; CHECK-NEXT:    i32.const $push2=, 80
+; CHECK-NEXT:    i32.add $push3=, $1, $pop2
+; CHECK-NEXT:    global.set __stack_pointer, $pop3
+; CHECK-NEXT:    local.copy $push8=, $0
+; CHECK-NEXT:    # fallthrough-return
   %t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
   %t1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 0
   ret i64 %t1
@@ -19,13 +33,24 @@ define i64 @test0() {
 
 define i128 @test1() {
 ; CHECK-LABEL: test1:
-; CHECK: call    	return_multi_multi
-; CHECK: i64.load	$[[RV:[0-9]+]]=, 16($[[SP:[0-9]+]])
-; CHECK: i32.const	$push0=, 24
-; CHECK: i32.add 	$push1=, $[[SP]], $pop0
-; CHECK: i64.load	$push2=, 0($pop1)
-; CHECK: i64.store	8($0), $pop2
-; CHECK: i64.store	0($0), $[[RV]]
+; CHECK:         .functype test1 (i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push1=, __stack_pointer
+; CHECK-NEXT:    i32.const $push2=, 80
+; CHECK-NEXT:    i32.sub $push8=, $pop1, $pop2
+; CHECK-NEXT:    local.tee $push7=, $2=, $pop8
+; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    i32.const $push5=, 8
+; CHECK-NEXT:    i32.add $push6=, $2, $pop5
+; CHECK-NEXT:    call return_multi_multi, $pop6
+; CHECK-NEXT:    i64.load $1=, 16($2)
+; CHECK-NEXT:    i64.load $push0=, 24($2)
+; CHECK-NEXT:    i64.store 8($0), $pop0
+; CHECK-NEXT:    i64.store 0($0), $1
+; CHECK-NEXT:    i32.const $push3=, 80
+; CHECK-NEXT:    i32.add $push4=, $2, $pop3
+; CHECK-NEXT:    global.set __stack_pointer, $pop4
+; CHECK-NEXT:    # fallthrough-return
   %t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
   %t1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 1
   ret i128 %t1
@@ -33,17 +58,28 @@ define i128 @test1() {
 
 define i192 @test2() {
 ; CHECK-LABEL: test2:
-; CHECK: call    	return_multi_multi
-; CHECK: i32.const	$push0=, 40
-; CHECK: i32.add 	$push1=, $[[SP:[0-9]+]], $pop0
-; CHECK: i64.load	$[[L1:[0-9]+]]=, 0($pop1)
-; CHECK: i64.load	$[[L2:[0-9]+]]=, 32($[[SP]])
-; CHECK: i32.const	$push2=, 48
-; CHECK: i32.add 	$push3=, $[[SP]], $pop2
-; CHECK: i64.load	$push4=, 0($pop3)
-; CHECK: i64.store	16($0), $pop4
-; CHECK: i64.store	0($0), $[[L2]]
-; CHECK: i64.store	8($0), $[[L1]]
+; CHECK:         .functype test2 (i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push3=, __stack_pointer
+; CHECK-NEXT:    i32.const $push4=, 80
+; CHECK-NEXT:    i32.sub $push10=, $pop3, $pop4
+; CHECK-NEXT:    local.tee $push9=, $3=, $pop10
+; CHECK-NEXT:    global.set __stack_pointer, $pop9
+; CHECK-NEXT:    i32.const $push7=, 8
+; CHECK-NEXT:    i32.add $push8=, $3, $pop7
+; CHECK-NEXT:    call return_multi_multi, $pop8
+; CHECK-NEXT:    i64.load $1=, 40($3)
+; CHECK-NEXT:    i64.load $2=, 32($3)
+; CHECK-NEXT:    i32.const $push0=, 48
+; CHECK-NEXT:    i32.add $push1=, $3, $pop0
+; CHECK-NEXT:    i64.load $push2=, 0($pop1)
+; CHECK-NEXT:    i64.store 16($0), $pop2
+; CHECK-NEXT:    i64.store 0($0), $2
+; CHECK-NEXT:    i64.store 8($0), $1
+; CHECK-NEXT:    i32.const $push5=, 80
+; CHECK-NEXT:    i32.add $push6=, $3, $pop5
+; CHECK-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-NEXT:    # fallthrough-return
   %t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
   %t1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 2
   ret i192 %t1
@@ -51,13 +87,24 @@ define i192 @test2() {
 
 define i128 @test3() {
 ; CHECK-LABEL: test3:
-; CHECK: call    	return_multi_multi
-; CHECK: i64.load	$[[L1:[0-9]+]]=, 56($[[SP:[0-9]+]])
-; CHECK: i32.const	$push0=, 64
-; CHECK: i32.add 	$push1=, $[[SP]], $pop0
-; CHECK: i64.load	$push2=, 0($pop1)
-; CHECK: i64.store	8($0), $pop2
-; CHECK: i64.store	0($0), $[[L1]]
+; CHECK:         .functype test3 (i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push1=, __stack_pointer
+; CHECK-NEXT:    i32.const $push2=, 80
+; CHECK-NEXT:    i32.sub $push8=, $pop1, $pop2
+; CHECK-NEXT:    local.tee $push7=, $2=, $pop8
+; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    i32.const $push5=, 8
+; CHECK-NEXT:    i32.add $push6=, $2, $pop5
+; CHECK-NEXT:    call return_multi_multi, $pop6
+; CHECK-NEXT:    i64.load $1=, 56($2)
+; CHECK-NEXT:    i64.load $push0=, 64($2)
+; CHECK-NEXT:    i64.store 8($0), $pop0
+; CHECK-NEXT:    i64.store 0($0), $1
+; CHECK-NEXT:    i32.const $push3=, 80
+; CHECK-NEXT:    i32.add $push4=, $2, $pop3
+; CHECK-NEXT:    global.set __stack_pointer, $pop4
+; CHECK-NEXT:    # fallthrough-return
   %t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
   %t1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 3
   ret i128 %t1
@@ -65,9 +112,22 @@ define i128 @test3() {
 
 define i64 @test4() {
 ; CHECK-LABEL: test4:
-; CHECK: call    	return_multi_multi
-; CHECK: i64.load	$[[L1:[0-9]+]]=, 72($[[SP:[0-9]+]])
-; CHECK: local.copy	$push8=, $[[L1]]
+; CHECK:         .functype test4 () -> (i64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push0=, __stack_pointer
+; CHECK-NEXT:    i32.const $push1=, 80
+; CHECK-NEXT:    i32.sub $push7=, $pop0, $pop1
+; CHECK-NEXT:    local.tee $push6=, $1=, $pop7
+; CHECK-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-NEXT:    i32.const $push4=, 8
+; CHECK-NEXT:    i32.add $push5=, $1, $pop4
+; CHECK-NEXT:    call return_multi_multi, $pop5
+; CHECK-NEXT:    i64.load $0=, 72($1)
+; CHECK-NEXT:    i32.const $push2=, 80
+; CHECK-NEXT:    i32.add $push3=, $1, $pop2
+; CHECK-NEXT:    global.set __stack_pointer, $pop3
+; CHECK-NEXT:    local.copy $push8=, $0
+; CHECK-NEXT:    # fallthrough-return
   %t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
   %t1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 4
   ret i64 %t1
@@ -77,17 +137,26 @@ define i64 @test4() {
 
 define { i64, i128 } @test5() {
 ; CHECK-LABEL: test5:
-; CHECK: call    	return_multi_multi
-; CHECK: i32.const	$push0=, 24
-; CHECK: i32.add 	$push1=, $[[SP:[0-9]+]], $pop0
-; CHECK: i64.load	$[[L1:[0-9]+]]=, 0($pop1)
-; CHECK: i64.load	$[[L2:[0-9]+]]=, 8($[[SP]])
-; CHECK: i64.load	$push2=, 16($[[SP]])
-; CHECK: i64.store	8($0), $pop2
-; CHECK: i64.store	16($0), $[[L1]]
-; CHECK: i64.store	0($0), $[[L2]]
-; CHECK: i32.const	$push5=, 80
-; CHECK: i32.add 	$push6=, $3, $pop5
+; CHECK:         .functype test5 (i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push1=, __stack_pointer
+; CHECK-NEXT:    i32.const $push2=, 80
+; CHECK-NEXT:    i32.sub $push8=, $pop1, $pop2
+; CHECK-NEXT:    local.tee $push7=, $3=, $pop8
+; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    i32.const $push5=, 8
+; CHECK-NEXT:    i32.add $push6=, $3, $pop5
+; CHECK-NEXT:    call return_multi_multi, $pop6
+; CHECK-NEXT:    i64.load $1=, 8($3)
+; CHECK-NEXT:    i64.load $2=, 24($3)
+; CHECK-NEXT:    i64.load $push0=, 16($3)
+; CHECK-NEXT:    i64.store 8($0), $pop0
+; CHECK-NEXT:    i64.store 16($0), $2
+; CHECK-NEXT:    i64.store 0($0), $1
+; CHECK-NEXT:    i32.const $push3=, 80
+; CHECK-NEXT:    i32.add $push4=, $3, $pop3
+; CHECK-NEXT:    global.set __stack_pointer, $pop4
+; CHECK-NEXT:    # fallthrough-return
   %t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
   %r0 = extractvalue { i64, i128, i192, i128, i64 } %t0, 0
   %r1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 1
@@ -98,21 +167,28 @@ define { i64, i128 } @test5() {
 
 define { i128, i128 } @test6() {
 ; CHECK-LABEL: test6:
-; CHECK: call    	return_multi_multi
-; CHECK: i32.const	$push0=, 24
-; CHECK: i32.add 	$push1=, $[[SP:[0-9]+]], $pop0
-; CHECK: i64.load	$[[L1:[0-9]+]]=, 0($pop1)
-; CHECK: i32.const	$push2=, 64
-; CHECK: i32.add 	$push3=, $[[SP]], $pop2
-; CHECK: i64.load	$[[L2:[0-9]+]]=, 0($pop3)
-; CHECK: i64.load	$[[L3:[0-9]+]]=, 16($[[SP]])
-; CHECK: i64.load	$push4=, 56($[[SP]])
-; CHECK: i64.store	16($0), $pop4
-; CHECK: i64.store	24($0), $[[L2]]
-; CHECK: i64.store	0($0), $[[L3]]
-; CHECK: i64.store	8($0), $[[L1]]
-; CHECK: i32.const	$push7=, 80
-; CHECK: i32.add	$push8=, $4, $pop7
+; CHECK:         .functype test6 (i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push1=, __stack_pointer
+; CHECK-NEXT:    i32.const $push2=, 80
+; CHECK-NEXT:    i32.sub $push8=, $pop1, $pop2
+; CHECK-NEXT:    local.tee $push7=, $4=, $pop8
+; CHECK-NEXT:    global.set __stack_pointer, $pop7
+; CHECK-NEXT:    i32.const $push5=, 8
+; CHECK-NEXT:    i32.add $push6=, $4, $pop5
+; CHECK-NEXT:    call return_multi_multi, $pop6
+; CHECK-NEXT:    i64.load $1=, 24($4)
+; CHECK-NEXT:    i64.load $2=, 16($4)
+; CHECK-NEXT:    i64.load $3=, 64($4)
+; CHECK-NEXT:    i64.load $push0=, 56($4)
+; CHECK-NEXT:    i64.store 16($0), $pop0
+; CHECK-NEXT:    i64.store 24($0), $3
+; CHECK-NEXT:    i64.store 0($0), $2
+; CHECK-NEXT:    i64.store 8($0), $1
+; CHECK-NEXT:    i32.const $push3=, 80
+; CHECK-NEXT:    i32.add $push4=, $4, $pop3
+; CHECK-NEXT:    global.set __stack_pointer, $pop4
+; CHECK-NEXT:    # fallthrough-return
   %t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
   %r1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 1
   %r3 = extractvalue { i64, i128, i192, i128, i64 } %t0, 3
@@ -123,21 +199,30 @@ define { i128, i128 } @test6() {
 
 define { i64, i192 } @test7() {
 ; CHECK-LABEL: test7:
-; CHECK: call    	return_multi_multi
-; CHECK: i32.const	$push0=, 40
-; CHECK: i32.add 	$push1=, $[[SP:[0-9]+]], $pop0
-; CHECK: i64.load	$[[L1:[0-9]+]]=, 0($pop1)
-; CHECK: i64.load	$[[L2:[0-9]+]]=, 8($[[SP]])
-; CHECK: i64.load	$[[L3:[0-9]+]]=, 32($[[SP]])
-; CHECK: i32.const	$push2=, 48
-; CHECK: i32.add 	$push3=, $[[SP]], $pop2
-; CHECK: i64.load	$push4=, 0($pop3)
-; CHECK: i64.store	24($0), $pop4
-; CHECK: i64.store	8($0), $[[L3]]
-; CHECK: i64.store	16($0), $[[L1]]
-; CHECK: i64.store	0($0), $[[L2]]
-; CHECK: i32.const	$push7=, 80
-; CHECK: i32.add 	$push8=, $4, $pop7
+; CHECK:         .functype test7 (i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push3=, __stack_pointer
+; CHECK-NEXT:    i32.const $push4=, 80
+; CHECK-NEXT:    i32.sub $push10=, $pop3, $pop4
+; CHECK-NEXT:    local.tee $push9=, $4=, $pop10
+; CHECK-NEXT:    global.set __stack_pointer, $pop9
+; CHECK-NEXT:    i32.const $push7=, 8
+; CHECK-NEXT:    i32.add $push8=, $4, $pop7
+; CHECK-NEXT:    call return_multi_multi, $pop8
+; CHECK-NEXT:    i64.load $1=, 8($4)
+; CHECK-NEXT:    i64.load $2=, 40($4)
+; CHECK-NEXT:    i64.load $3=, 32($4)
+; CHECK-NEXT:    i32.const $push0=, 48
+; CHECK-NEXT:    i32.add $push1=, $4, $pop0
+; CHECK-NEXT:    i64.load $push2=, 0($pop1)
+; CHECK-NEXT:    i64.store 24($0), $pop2
+; CHECK-NEXT:    i64.store 8($0), $3
+; CHECK-NEXT:    i64.store 16($0), $2
+; CHECK-NEXT:    i64.store 0($0), $1
+; CHECK-NEXT:    i32.const $push5=, 80
+; CHECK-NEXT:    i32.add $push6=, $4, $pop5
+; CHECK-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-NEXT:    # fallthrough-return
   %t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
   %r0 = extractvalue { i64, i128, i192, i128, i64 } %t0, 0
   %r2 = extractvalue { i64, i128, i192, i128, i64 } %t0, 2
@@ -148,33 +233,38 @@ define { i64, i192 } @test7() {
 
 define { i128, i192, i128, i64 } @test8() {
 ; CHECK-LABEL: test8:
-; CHECK: call    	return_multi_multi
-; CHECK: i32.const	$push0=, 64
-; CHECK: i32.add 	$push1=, $[[SP:[0-9]+]], $pop0
-; CHECK: i64.load	$[[L1:[0-9]+]]=, 0($pop1)
-; CHECK: i32.const	$push2=, 40
-; CHECK: i32.add 	$push3=, $[[SP]], $pop2
-; CHECK: i64.load	$[[L2:[0-9]+]]=, 0($pop3)
-; CHECK: i32.const	$push4=, 48
-; CHECK: i32.add 	$push5=, $[[SP]], $pop4
-; CHECK: i64.load	$[[L3:[0-9]+]]=, 0($pop5)
-; CHECK: i32.const	$push6=, 24
-; CHECK: i32.add 	$push7=, $[[SP]], $pop6
-; CHECK: i64.load	$[[L4:[0-9]+]]=, 0($pop7)
-; CHECK: i64.load	$[[L5:[0-9]+]]=, 8($[[SP]])
-; CHECK: i64.load	$[[L6:[0-9]+]]=, 56($[[SP]])
-; CHECK: i64.load	$[[L7:[0-9]+]]=, 32($[[SP]])
-; CHECK: i64.load	$push8=, 16($[[SP]])
-; CHECK: i64.store	40($0), $pop8
-; CHECK: i64.store	48($0), $[[L4]]
-; CHECK: i64.store	32($0), $[[L3]]
-; CHECK: i64.store	16($0), $[[L7]]
-; CHECK: i64.store	24($0), $[[L2]]
-; CHECK: i64.store	0($0), $[[L6]]
-; CHECK: i64.store	8($0), $[[L1]]
-; CHECK: i64.store	56($0), $[[L5]]
-; CHECK: i32.const	$push11=, 80
-; CHECK: i32.add 	$push12=, $8, $pop11
+; CHECK:         .functype test8 (i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    global.get $push3=, __stack_pointer
+; CHECK-NEXT:    i32.const $push4=, 80
+; CHECK-NEXT:    i32.sub $push10=, $pop3, $pop4
+; CHECK-NEXT:    local.tee $push9=, $8=, $pop10
+; CHECK-NEXT:    global.set __stack_pointer, $pop9
+; CHECK-NEXT:    i32.const $push7=, 8
+; CHECK-NEXT:    i32.add $push8=, $8, $pop7
+; CHECK-NEXT:    call return_multi_multi, $pop8
+; CHECK-NEXT:    i32.const $push0=, 48
+; CHECK-NEXT:    i32.add $push1=, $8, $pop0
+; CHECK-NEXT:    i64.load $1=, 0($pop1)
+; CHECK-NEXT:    i64.load $2=, 8($8)
+; CHECK-NEXT:    i64.load $3=, 64($8)
+; CHECK-NEXT:    i64.load $4=, 56($8)
+; CHECK-NEXT:    i64.load $5=, 40($8)
+; CHECK-NEXT:    i64.load $6=, 32($8)
+; CHECK-NEXT:    i64.load $7=, 24($8)
+; CHECK-NEXT:    i64.load $push2=, 16($8)
+; CHECK-NEXT:    i64.store 40($0), $pop2
+; CHECK-NEXT:    i64.store 48($0), $7
+; CHECK-NEXT:    i64.store 32($0), $1
+; CHECK-NEXT:    i64.store 16($0), $6
+; CHECK-NEXT:    i64.store 24($0), $5
+; CHECK-NEXT:    i64.store 0($0), $4
+; CHECK-NEXT:    i64.store 8($0), $3
+; CHECK-NEXT:    i64.store 56($0), $2
+; CHECK-NEXT:    i32.const $push5=, 80
+; CHECK-NEXT:    i32.add $push6=, $8, $pop5
+; CHECK-NEXT:    global.set __stack_pointer, $pop6
+; CHECK-NEXT:    # fallthrough-return
   %t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
   %r0 = extractvalue { i64, i128, i192, i128, i64 } %t0, 0
   %r1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 1
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll
index 2958b115df9d3e..c1343d32f80e56 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll
+++ b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll
@@ -34,9 +34,7 @@ define i128 @multivalue_sdiv(i128 %a, i128 %b) {
 ; NO_MULTIVALUE-NEXT:    call __divti3
 ; NO_MULTIVALUE-NEXT:    local.get 0
 ; NO_MULTIVALUE-NEXT:    local.get 5
-; NO_MULTIVALUE-NEXT:    i32.const 8
-; NO_MULTIVALUE-NEXT:    i32.add
-; NO_MULTIVALUE-NEXT:    i64.load 0
+; NO_MULTIVALUE-NEXT:    i64.load 8
 ; NO_MULTIVALUE-NEXT:    i64.store 8
 ; NO_MULTIVALUE-NEXT:    local.get 0
 ; NO_MULTIVALUE-NEXT:    local.get 5
@@ -80,9 +78,7 @@ define fp128 @multivalue_fsub(fp128 %a, fp128 %b) {
 ; NO_MULTIVALUE-NEXT:    call __subtf3
 ; NO_MULTIVALUE-NEXT:    local.get 0
 ; NO_MULTIVALUE-NEXT:    local.get 5
-; NO_MULTIVALUE-NEXT:    i32.const 8
-; NO_MULTIVALUE-NEXT:    i32.add
-; NO_MULTIVALUE-NEXT:    i64.load 0
+; NO_MULTIVALUE-NEXT:    i64.load 8
 ; NO_MULTIVALUE-NEXT:    i64.store 8
 ; NO_MULTIVALUE-NEXT:    local.get 0
 ; NO_MULTIVALUE-NEXT:    local.get 5
@@ -125,9 +121,7 @@ define i128 @multivalue_lshr(i128 %a, i128 %b) {
 ; NO_MULTIVALUE-NEXT:    call __ashlti3
 ; NO_MULTIVALUE-NEXT:    local.get 0
 ; NO_MULTIVALUE-NEXT:    local.get 5
-; NO_MULTIVALUE-NEXT:    i32.const 8
-; NO_MULTIVALUE-NEXT:    i32.add
-; NO_MULTIVALUE-NEXT:    i64.load 0
+; NO_MULTIVALUE-NEXT:    i64.load 8
 ; NO_MULTIVALUE-NEXT:    i64.store 8
 ; NO_MULTIVALUE-NEXT:    local.get 0
 ; NO_MULTIVALUE-NEXT:    local.get 5
diff --git a/llvm/test/CodeGen/WebAssembly/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/WebAssembly/umulo-128-legalisation-lowering.ll
index 90e1a176e38f9c..110fb2d43580aa 100644
--- a/llvm/test/CodeGen/WebAssembly/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/WebAssembly/umulo-128-legalisation-lowering.ll
@@ -3,78 +3,76 @@
 
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; WASM32-LABEL: muloti_test
-; WASM32: global.get      $push18=, __stack_pointer
-; WASM32: i32.const       $push19=, 48
-; WASM32: i32.sub         $push40=, $pop18, $pop19
-; WASM32: local.tee       $push39=, 5, $pop40
-; WASM32: global.set      __stack_pointer, $pop39
-; WASM32: local.get       $push41=, 5
-; WASM32: i32.const       $push24=, 32
-; WASM32: i32.add         $push25=, $pop41, $pop24
-; WASM32: local.get       $push43=, 1
+; WASM32: global.get      $push16=, __stack_pointer
+; WASM32: i32.const       $push17=, 48
+; WASM32: i32.sub         $push38=, $pop16, $pop17
+; WASM32: local.tee       $push37=, 5, $pop38
+; WASM32: global.set      __stack_pointer, $pop37
+; WASM32: local.get       $push39=, 5
+; WASM32: i32.const       $push22=, 32
+; WASM32: i32.add         $push23=, $pop39, $pop22
+; WASM32: local.get       $push41=, 1
 ; WASM32: i64.const       $push0=, 0
-; WASM32: local.get       $push42=, 3
-; WASM32: i64.const       $push38=, 0
-; WASM32: call            __multi3, $pop25, $pop43, $pop0, $pop42, $pop38
-; WASM32: local.get       $push44=, 5
-; WASM32: i32.const       $push22=, 16
-; WASM32: i32.add         $push23=, $pop44, $pop22
-; WASM32: local.get       $push46=, 4
-; WASM32: i64.const       $push37=, 0
-; WASM32: local.get       $push45=, 1
+; WASM32: local.get       $push40=, 3
 ; WASM32: i64.const       $push36=, 0
-; WASM32: call            __multi3, $pop23, $pop46, $pop37, $pop45, $pop36
-; WASM32: local.get       $push49=, 5
-; WASM32: local.get       $push48=, 2
+; WASM32: call __multi3,  $pop23, $pop41, $pop0, $pop40, $pop36
+; WASM32: local.get       $push42=, 5
+; WASM32: i32.const       $push20=, 16
+; WASM32: i32.add         $push21=, $pop42, $pop20
+; WASM32: local.get       $push44=, 4
 ; WASM32: i64.const       $push35=, 0
-; WASM32: local.get       $push47=, 3
+; WASM32: local.get       $push43=, 1
 ; WASM32: i64.const       $push34=, 0
-; WASM32: call            __multi3, $pop49, $pop48, $pop35, $pop47, $pop34
-; WASM32: local.get       $push51=, 0
+; WASM32: call __multi3,  $pop21, $pop44, $pop35, $pop43, $pop34
+; WASM32: local.get       $push47=, 5
+; WASM32: local.get       $push46=, 2
+; WASM32: i64.const       $push33=, 0
+; WASM32: local.get       $push45=, 3
+; WASM32: i64.const       $push32=, 0
+; WASM32: call __multi3,  $pop47, $pop46, $pop33, $pop45, $pop32
+; WASM32: local.get       $push49=, 0
+; WASM32: local.get       $push48=, 5
+; WASM32: i64.load        $push1=, 32($pop48)
+; WASM32: i64.store       0($pop49), $pop1
+; WASM32: local.get       $push53=, 0
 ; WASM32: local.get       $push50=, 5
-; WASM32: i64.load        $push1=, 32($pop50)
-; WASM32: i64.store       0($pop51), $pop1
-; WASM32: local.get       $push55=, 0
+; WASM32: i64.load        $push31=, 40($pop50)
+; WASM32: local.tee       $push30=, 3, $pop31
+; WASM32: local.get       $push51=, 5
+; WASM32: i64.load        $push3=, 0($pop51)
 ; WASM32: local.get       $push52=, 5
-; WASM32: i32.const       $push5=, 40
-; WASM32: i32.add         $push6=, $pop52, $pop5
-; WASM32: i64.load        $push33=, 0($pop6)
-; WASM32: local.tee       $push32=, 3, $pop33
-; WASM32: local.get       $push53=, 5
-; WASM32: i64.load        $push3=, 0($pop53)
-; WASM32: local.get       $push54=, 5
-; WASM32: i64.load        $push2=, 16($pop54)
+; WASM32: i64.load        $push2=, 16($pop52)
 ; WASM32: i64.add         $push4=, $pop3, $pop2
-; WASM32: i64.add         $push31=, $pop32, $pop4
-; WASM32: local.tee       $push30=, 1, $pop31
-; WASM32: i64.store       8($pop55), $pop30
-; WASM32: local.get       $push62=, 0
-; WASM32: local.get       $push56=, 2
-; WASM32: i64.const       $push29=, 0
-; WASM32: i64.ne          $push8=, $pop56, $pop29
-; WASM32: local.get       $push57=, 4
-; WASM32: i64.const       $push28=, 0
-; WASM32: i64.ne          $push7=, $pop57, $pop28
-; WASM32: i32.and         $push9=, $pop8, $pop7
-; WASM32: local.get       $push58=, 5
-; WASM32: i64.load        $push10=, 8($pop58)
+; WASM32: i64.add         $push29=, $pop30, $pop4
+; WASM32: local.tee       $push28=, 1, $pop29
+; WASM32: i64.store       8($pop53), $pop28
+; WASM32: local.get       $push60=, 0
+; WASM32: local.get       $push54=, 2
 ; WASM32: i64.const       $push27=, 0
-; WASM32: i64.ne          $push11=, $pop10, $pop27
-; WASM32: i32.or          $push12=, $pop9, $pop11
-; WASM32: local.get       $push59=, 5
-; WASM32: i64.load        $push13=, 24($pop59)
+; WASM32: i64.ne          $push6=, $pop54, $pop27
+; WASM32: local.get       $push55=, 4
 ; WASM32: i64.const       $push26=, 0
-; WASM32: i64.ne          $push14=, $pop13, $pop26
-; WASM32: i32.or          $push15=, $pop12, $pop14
-; WASM32: local.get       $push61=, 1
-; WASM32: local.get       $push60=, 3
-; WASM32: i64.lt_u        $push16=, $pop61, $pop60
-; WASM32: i32.or          $push17=, $pop15, $pop16
-; WASM32: i32.store8      16($pop62), $pop17
-; WASM32: local.get       $push63=, 5
-; WASM32: i32.const       $push20=, 48
-; WASM32: i32.add         $push21=, $pop63, $pop20
-; WASM32: global.set      __stack_pointer, $pop21
+; WASM32: i64.ne          $push5=, $pop55, $pop26
+; WASM32: i32.and         $push7=, $pop6, $pop5
+; WASM32: local.get       $push56=, 5
+; WASM32: i64.load        $push8=, 8($pop56)
+; WASM32: i64.const       $push25=, 0
+; WASM32: i64.ne          $push9=, $pop8, $pop25
+; WASM32: i32.or          $push10=, $pop7, $pop9
+; WASM32: local.get       $push57=, 5
+; WASM32: i64.load        $push11=, 24($pop57)
+; WASM32: i64.const       $push24=, 0
+; WASM32: i64.ne          $push12=, $pop11, $pop24
+; WASM32: i32.or          $push13=, $pop10, $pop12
+; WASM32: local.get       $push59=, 1
+; WASM32: local.get       $push58=, 3
+; WASM32: i64.lt_u        $push14=, $pop59, $pop58
+; WASM32: i32.or          $push15=, $pop13, $pop14
+; WASM32: i32.store8      16($pop60), $pop15
+; WASM32: local.get       $push61=, 5
+; WASM32: i32.const       $push18=, 48
+; WASM32: i32.add         $push19=, $pop61, $pop18
+; WASM32: global.set      __stack_pointer, $pop19
 
 start:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2

>From ce3717bf3967eef38728a6738cb6bbaf40dab5f3 Mon Sep 17 00:00:00 2001
From: Dan Gohman <dev at sunfishcode.online>
Date: Mon, 9 Dec 2024 23:08:42 -0800
Subject: [PATCH 2/2] Update CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll.

---
 .../AMDGPU/buffer-intrinsics-mmo-offsets.ll   | 291 +++++++++---------
 1 file changed, 144 insertions(+), 147 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
index 84ea2beb8d04bd..384beae07ce2ea 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
@@ -13,122 +13,119 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
   ; GCN-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
   ; GCN-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg0, addrspace 6)
   ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s64) from %ir.arg0, align 16, addrspace 6)
-  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-  ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 8
-  ; GCN-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], killed [[S_MOV_B32_1]], implicit-def dead $scc
-  ; GCN-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE killed [[S_ADD_I32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
-  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[REG_SEQUENCE1]], 0, 0 :: (dereferenceable invariant load (s64) from %ir.arg0 + 8, basealign 16, addrspace 6)
-  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
-  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
-  ; GCN-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[COPY5]], %subreg.sub2, killed [[COPY4]], %subreg.sub3
+  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[REG_SEQUENCE]], 8, 0 :: (dereferenceable invariant load (s64) from %ir.arg0 + 8, basealign 16, addrspace 6)
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+  ; GCN-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1, killed [[COPY3]], %subreg.sub2, killed [[COPY2]], %subreg.sub3
   ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 64
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 128
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_1]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 128
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]]
   ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[COPY7]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY7]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 72
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 144
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 144
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]]
   ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_3]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[COPY9]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[COPY9]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 80
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 160
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 160
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[COPY]]
   ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY10]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_6]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE2]], [[COPY11]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[COPY11]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 88
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 176
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE3]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 88
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 176
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE2]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[COPY12]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[COPY12]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[S_MOV_B32_8]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[S_MOV_B32_9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE3]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE2]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 96
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 192
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_9]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 192
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY]]
   ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[COPY14]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET5]], [[REG_SEQUENCE2]], [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET6]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET5]], [[REG_SEQUENCE1]], [[S_MOV_B32_9]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET6]], [[REG_SEQUENCE1]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET7]], [[REG_SEQUENCE2]], [[COPY15]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET7]], [[REG_SEQUENCE1]], [[COPY15]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 104
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 208
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 208
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]]
   ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5]], [[REG_SEQUENCE2]], [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5]], [[REG_SEQUENCE1]], [[S_MOV_B32_11]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6]], [[REG_SEQUENCE1]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY17:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7]], [[REG_SEQUENCE2]], [[COPY17]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7]], [[REG_SEQUENCE1]], [[COPY17]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 112
+  ; GCN-NEXT:   [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112
   ; GCN-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 224
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 224
   ; GCN-NEXT:   [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   [[COPY22:%[0-9]+]]:sreg_32 = COPY [[COPY]]
   ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY21]], [[S_LOAD_DWORDX4_IMM]], [[COPY22]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
@@ -137,27 +134,27 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
   ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN-NEXT:   [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY23]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY23]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY24]], [[REG_SEQUENCE2]], [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY24]], [[REG_SEQUENCE1]], [[S_MOV_B32_13]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY25]], [[REG_SEQUENCE2]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_BOTHEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY25]], [[REG_SEQUENCE1]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_BOTHEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY26]], [[REG_SEQUENCE2]], [[COPY27]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY26]], [[REG_SEQUENCE1]], [[COPY27]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN-NEXT:   [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 120
+  ; GCN-NEXT:   [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120
   ; GCN-NEXT:   [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 240
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 240
   ; GCN-NEXT:   [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   [[COPY32:%[0-9]+]]:sreg_32 = COPY [[COPY]]
   ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY31]], [[S_LOAD_DWORDX4_IMM]], [[COPY32]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
@@ -165,26 +162,26 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
   ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN-NEXT:   [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY33]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY33]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY34]], [[REG_SEQUENCE2]], [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY34]], [[REG_SEQUENCE1]], [[S_MOV_B32_15]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY35]], [[REG_SEQUENCE2]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY35]], [[REG_SEQUENCE1]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   [[COPY37:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY36]], [[REG_SEQUENCE2]], [[COPY37]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY36]], [[REG_SEQUENCE1]], [[COPY37]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN-NEXT:   [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY38]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 256
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 256
   ; GCN-NEXT:   [[COPY40:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY40]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_18]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_18]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY40]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY41:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   [[COPY42:%[0-9]+]]:sreg_32 = COPY [[COPY]]
   ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY41]], [[S_LOAD_DWORDX4_IMM]], [[COPY42]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
@@ -192,54 +189,54 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
   ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN-NEXT:   [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY43]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY43]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY44]], [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY44]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY45]], [[REG_SEQUENCE2]], [[S_MOV_B32_18]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_18]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY45]], [[REG_SEQUENCE1]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   [[COPY47:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY46]], [[REG_SEQUENCE2]], [[COPY47]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY46]], [[REG_SEQUENCE1]], [[COPY47]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN-NEXT:   [[COPY48:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY48]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 136
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY48]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 136
   ; GCN-NEXT:   [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 272
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 272
   ; GCN-NEXT:   [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   [[COPY52:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[COPY52]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[COPY52]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN-NEXT:   [[COPY53:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY53]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY53]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY54:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY54]], [[REG_SEQUENCE2]], [[S_MOV_B32_19]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY54]], [[REG_SEQUENCE1]], [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY55:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY55]], [[REG_SEQUENCE2]], [[S_MOV_B32_20]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_20]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY55]], [[REG_SEQUENCE1]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY56:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   [[COPY57:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY56]], [[REG_SEQUENCE2]], [[COPY57]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY56]], [[REG_SEQUENCE1]], [[COPY57]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN-NEXT:   [[COPY58:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[COPY58]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY59:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY59]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 288
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY59]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 288
   ; GCN-NEXT:   [[COPY60:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY60]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY60]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY61:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   [[COPY62:%[0-9]+]]:sreg_32 = COPY [[COPY]]
   ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY61]], [[S_LOAD_DWORDX4_IMM]], [[COPY62]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
@@ -247,44 +244,44 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
   ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN-NEXT:   [[COPY63:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[COPY63]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[COPY63]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY64:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY64]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY64]], [[REG_SEQUENCE1]], [[S_MOV_B32_4]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY65:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN8]], [[COPY65]], [[REG_SEQUENCE2]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN1]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN8]], [[COPY65]], [[REG_SEQUENCE1]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN1]], [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY66:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   [[COPY67:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN9]], [[COPY66]], [[REG_SEQUENCE2]], [[COPY67]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN10]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN11]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN9]], [[COPY66]], [[REG_SEQUENCE1]], [[COPY67]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN10]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN11]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
   ; GCN-NEXT:   [[COPY68:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[COPY68]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 152
+  ; GCN-NEXT:   [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152
   ; GCN-NEXT:   [[COPY69:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY69]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   [[S_MOV_B32_23:%[0-9]+]]:sreg_32 = S_MOV_B32 304
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY69]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 304
   ; GCN-NEXT:   [[COPY70:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY70]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY70]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY71:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   [[COPY72:%[0-9]+]]:sreg_32 = COPY [[COPY]]
   ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY71]], [[S_LOAD_DWORDX4_IMM]], [[COPY72]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
   ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
   ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY73:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[COPY73]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[COPY73]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY74:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY74]], [[REG_SEQUENCE2]], [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY74]], [[REG_SEQUENCE1]], [[S_MOV_B32_21]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY75:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8]], [[COPY75]], [[REG_SEQUENCE2]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN1]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8]], [[COPY75]], [[REG_SEQUENCE1]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN1]], [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   [[COPY76:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GCN-NEXT:   [[COPY77:%[0-9]+]]:sreg_32 = COPY [[COPY]]
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9]], [[COPY76]], [[REG_SEQUENCE2]], [[COPY77]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
-  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9]], [[COPY76]], [[REG_SEQUENCE1]], [[COPY77]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+  ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
   ; GCN-NEXT:   S_ENDPGM 0
 bb.0:
   %tmp0 = load <4 x i32>, ptr addrspace(6) %arg0, align 16, !invariant.load !0



More information about the llvm-commits mailing list