[llvm] AMDGPU: Add tests for ds_write2 formation with agprs (PR #155765)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 28 00:06:24 PDT 2025


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/155765

>From e600683ee0e85d7e33e3d1050a7dc4094cc1b2a6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 28 Aug 2025 15:03:04 +0900
Subject: [PATCH] AMDGPU: Add tests for ds_write2 formation with agprs

The current handling for write2 formation is overly conservative
and cannot form write2s with AGPR inputs.
---
 llvm/test/CodeGen/AMDGPU/a-v-ds-write2.ll | 1110 +++++++++++++++++++++
 1 file changed, 1110 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/a-v-ds-write2.ll

diff --git a/llvm/test/CodeGen/AMDGPU/a-v-ds-write2.ll b/llvm/test/CodeGen/AMDGPU/a-v-ds-write2.ll
new file mode 100644
index 0000000000000..143d96aacd3fe
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/a-v-ds-write2.ll
@@ -0,0 +1,1110 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+
+; Make sure the register class requirements of ds_write2_* instructions
+; are properly respected when they can use AGPRs. Both data operands
+; together must be VGPR or AGPR.
+
+;---------------------------------------------------------------------
+; b32 cases
+;---------------------------------------------------------------------
+
+; Test a pattern that can form ds_write2_b32 with data in AGPRs
+define void @ds_write2_b32_a_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b32_a_a:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b32 v0, a0 offset:40
+; GCN-NEXT:    ds_write_b32 v0, a1 offset:96
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+  %a0 = call i32 asm "; def $0", "=a"()
+  %a1 = call i32 asm "; def $0", "=a"()
+  store i32 %a0, ptr addrspace(3) %gep.0
+  store i32 %a1, ptr addrspace(3) %gep.1
+  ret void
+}
+
+define void @ds_write2_b32_a_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b32_a_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b32 v0, a0 offset:40
+; GCN-NEXT:    ds_write_b32 v0, v1 offset:96
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+  %a0 = call i32 asm "; def $0", "=a"()
+  %v0 = call i32 asm "; def $0", "=v"()
+  store i32 %a0, ptr addrspace(3) %gep.0
+  store i32 %v0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+define void @ds_write2_b32_v_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b32_v_a:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b32 v0, a0 offset:40
+; GCN-NEXT:    ds_write_b32 v0, v1 offset:96
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+  %v0 = call i32 asm "; def $0", "=v"()
+  %a0 = call i32 asm "; def $0", "=a"()
+  store i32 %a0, ptr addrspace(3) %gep.0
+  store i32 %v0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+define void @ds_write2_b32_v_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b32_v_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v2
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write2_b32 v0, v1, v2 offset0:10 offset1:24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+  %v0 = call i32 asm "; def $0", "=v"()
+  %v1 = call i32 asm "; def $0", "=v"()
+  store i32 %v0, ptr addrspace(3) %gep.0
+  store i32 %v1, ptr addrspace(3) %gep.1
+  ret void
+}
+
+define void @ds_write2_b32_av_av(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b32_av_av:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v2
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write2_b32 v0, v1, v2 offset0:10 offset1:24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+  %av0 = call i32 asm "; def $0", "=^VA"()
+  %av1 = call i32 asm "; def $0", "=^VA"()
+  store i32 %av0, ptr addrspace(3) %gep.0
+  store i32 %av1, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; Should be able to constrain the AV operand to VGPR to enable merging
+define void @ds_write2_b32_av_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b32_av_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v2
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write2_b32 v0, v1, v2 offset0:10 offset1:24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+  %av0 = call i32 asm "; def $0", "=^VA"()
+  %v0 = call i32 asm "; def $0", "=v"()
+  store i32 %av0, ptr addrspace(3) %gep.0
+  store i32 %v0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; Should be able to constrain the AV operand to VGPR to enable merging
+define void @ds_write2_b32_v_av(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b32_v_av:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v2
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write2_b32 v0, v1, v2 offset0:10 offset1:24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+  %v0 = call i32 asm "; def $0", "=v"()
+  %av0 = call i32 asm "; def $0", "=^VA"()
+  store i32 %v0, ptr addrspace(3) %gep.0
+  store i32 %av0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; Should be able to constrain the AV operand to AGPR to enable merging
+define void @ds_write2_b32_av_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b32_av_a:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b32 v0, v1 offset:40
+; GCN-NEXT:    ds_write_b32 v0, a0 offset:96
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+  %av0 = call i32 asm "; def $0", "=^VA"()
+  %a0 = call i32 asm "; def $0", "=a"()
+  store i32 %av0, ptr addrspace(3) %gep.0
+  store i32 %a0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; Should be able to constrain the AV operand to AGPR to enable merging
+define void @ds_write2_b32_a_av(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b32_a_av:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b32 v0, a0 offset:40
+; GCN-NEXT:    ds_write_b32 v0, v1 offset:96
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+  %a0 = call i32 asm "; def $0", "=a"()
+  %av0 = call i32 asm "; def $0", "=^VA"()
+  store i32 %a0, ptr addrspace(3) %gep.0
+  store i32 %av0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+define void @ds_write2st64_b32_a_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b32_a_a:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b32 v0, a0 offset:256
+; GCN-NEXT:    ds_write_b32 v0, a1 offset:1024
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %a0 = call i32 asm "; def $0", "=a"()
+  %a1 = call i32 asm "; def $0", "=a"()
+  store i32 %a0, ptr addrspace(3) %gep.0
+  store i32 %a1, ptr addrspace(3) %gep.1
+  ret void
+}
+
+define void @ds_write2st64_b32_a_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b32_a_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b32 v0, a0 offset:256
+; GCN-NEXT:    ds_write_b32 v0, v1 offset:1024
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %a0 = call i32 asm "; def $0", "=a"()
+  %v0 = call i32 asm "; def $0", "=v"()
+  store i32 %a0, ptr addrspace(3) %gep.0
+  store i32 %v0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+define void @ds_write2st64_b32_v_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b32_v_a:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b32 v0, v1 offset:256
+; GCN-NEXT:    ds_write_b32 v0, a0 offset:1024
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %a0 = call i32 asm "; def $0", "=a"()
+  %v0 = call i32 asm "; def $0", "=v"()
+  store i32 %v0, ptr addrspace(3) %gep.0
+  store i32 %a0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+define void @ds_write2st64_b32_v_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b32_v_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v2
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write2st64_b32 v0, v1, v2 offset0:1 offset1:4
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %v0 = call i32 asm "; def $0", "=v"()
+  %v1 = call i32 asm "; def $0", "=v"()
+  store i32 %v0, ptr addrspace(3) %gep.0
+  store i32 %v1, ptr addrspace(3) %gep.1
+  ret void
+}
+
+define void @ds_write2st64_b32_av_av(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b32_av_av:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v2
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write2st64_b32 v0, v1, v2 offset0:1 offset1:4
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %av0 = call i32 asm "; def $0", "=^VA"()
+  %av1 = call i32 asm "; def $0", "=^VA"()
+  store i32 %av0, ptr addrspace(3) %gep.0
+  store i32 %av1, ptr addrspace(3) %gep.1
+  ret void
+}
+
+define void @ds_write2st64_b32_av_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b32_av_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v2
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write2st64_b32 v0, v1, v2 offset0:1 offset1:4
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %av0 = call i32 asm "; def $0", "=^VA"()
+  %v0 = call i32 asm "; def $0", "=v"()
+  store i32 %av0, ptr addrspace(3) %gep.0
+  store i32 %v0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+
+define void @ds_write2st64_b32_v_av(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b32_v_av:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v2
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write2st64_b32 v0, v1, v2 offset0:1 offset1:4
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %v0 = call i32 asm "; def $0", "=v"()
+  %av1 = call i32 asm "; def $0", "=^VA"()
+  store i32 %v0, ptr addrspace(3) %gep.0
+  store i32 %av1, ptr addrspace(3) %gep.1
+  ret void
+}
+
+define void @ds_write2st64_b32_av_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b32_av_a:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b32 v0, v1 offset:256
+; GCN-NEXT:    ds_write_b32 v0, a0 offset:1024
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %av0 = call i32 asm "; def $0", "=^VA"()
+  %a0 = call i32 asm "; def $0", "=a"()
+  store i32 %av0, ptr addrspace(3) %gep.0
+  store i32 %a0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+
+define void @ds_write2st64_b32_a_av(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b32_a_av:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b32 v0, a0 offset:256
+; GCN-NEXT:    ds_write_b32 v0, v1 offset:1024
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %a0 = call i32 asm "; def $0", "=a"()
+  %av1 = call i32 asm "; def $0", "=^VA"()
+  store i32 %a0, ptr addrspace(3) %gep.0
+  store i32 %av1, ptr addrspace(3) %gep.1
+  ret void
+}
+
+define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b32_av_av_no_vgprs:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v40 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v41 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v42 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v43 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v44 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v45 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v46 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v47 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v56 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v57 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v58 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v59 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v60 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a16, v61 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a17, v62 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v0
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a1
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a2
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[0:31]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT:    v_accvgpr_write_b32 a19, v31 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    ds_write2_b32 v0, v1, v2 offset0:10 offset1:24
+; GCN-NEXT:    v_accvgpr_write_b32 a31, v19 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a30, v20 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a29, v21 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a28, v22 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a27, v23 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a26, v24 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a25, v25 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a24, v26 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a23, v27 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a22, v28 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a21, v29 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a20, v30 ; Reload Reuse
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT:    v_accvgpr_read_b32 v19, a31 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v20, a30 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v21, a29 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v22, a28 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v23, a27 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v24, a26 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v25, a25 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v26, a24 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v27, a23 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v29, a21 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v30, a20 ; Reload Reuse
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; use v[0:31]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    v_accvgpr_read_b32 v63, a18 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v62, a17 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v61, a16 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v60, a15 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v59, a14 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v58, a13 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v57, a12 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v56, a11 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v47, a10 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v46, a9 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v45, a8 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v44, a7 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v43, a6 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v42, a5 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v41, a4 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v40, a3 ; Reload Reuse
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+  %av0 = call i32 asm sideeffect "; def $0", "=^VA"()
+  %av1 = call i32 asm sideeffect "; def $0", "=^VA"()
+  %vgpr.def = call { <32 x i32>, <32 x i32> }  asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
+  %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
+  %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
+  store i32 %av0, ptr addrspace(3) %gep.0
+  store i32 %av1, ptr addrspace(3) %gep.1
+  call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
+  ret void
+}
+
+;---------------------------------------------------------------------
+; b64 cases
+;---------------------------------------------------------------------
+
+; Test a pattern that can form ds_write2_b64 with data in AGPRs
+define void @ds_write2_b64_a_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b64_a_a:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a[0:1]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b64 v0, a[0:1] offset:40
+; GCN-NEXT:    ds_write_b64 v0, a[2:3] offset:96
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+  %a0 = call i64 asm "; def $0", "=a"()
+  %a1 = call i64 asm "; def $0", "=a"()
+  store i64 %a0, ptr addrspace(3) %gep.0
+  store i64 %a1, ptr addrspace(3) %gep.1
+  ret void
+}
+
+define void @ds_write2_b64_a_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b64_a_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a[0:1]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b64 v0, a[0:1] offset:40
+; GCN-NEXT:    ds_write_b64 v0, v[2:3] offset:96
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+  %a0 = call i64 asm "; def $0", "=a"()
+  %v0 = call i64 asm "; def $0", "=v"()
+  store i64 %a0, ptr addrspace(3) %gep.0
+  store i64 %v0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+define void @ds_write2_b64_v_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b64_v_a:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a[0:1]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b64 v0, a[0:1] offset:40
+; GCN-NEXT:    ds_write_b64 v0, v[2:3] offset:96
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+  %v0 = call i64 asm "; def $0", "=v"()
+  %a0 = call i64 asm "; def $0", "=a"()
+  store i64 %a0, ptr addrspace(3) %gep.0
+  store i64 %v0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+define void @ds_write2_b64_v_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b64_v_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[4:5]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset0:5 offset1:12
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+  %v0 = call i64 asm "; def $0", "=v"()
+  %v1 = call i64 asm "; def $0", "=v"()
+  store i64 %v0, ptr addrspace(3) %gep.0
+  store i64 %v1, ptr addrspace(3) %gep.1
+  ret void
+}
+
+define void @ds_write2_b64_av_av(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b64_av_av:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[4:5]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset0:5 offset1:12
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+  %av0 = call i64 asm "; def $0", "=^VA"()
+  %av1 = call i64 asm "; def $0", "=^VA"()
+  store i64 %av0, ptr addrspace(3) %gep.0
+  store i64 %av1, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; Should be able to constrain the AV operand to VGPR to enable merging
+define void @ds_write2_b64_av_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b64_av_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[4:5]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset0:5 offset1:12
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i64 0, i64 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i64 0, i64 24
+  %av0 = call i64 asm "; def $0", "=^VA"()
+  %v0 = call i64 asm "; def $0", "=v"()
+  store i64 %av0, ptr addrspace(3) %gep.0
+  store i64 %v0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; Should be able to constrain the AV operand to VGPR to enable merging
+define void @ds_write2_b64_v_av(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b64_v_av:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[4:5]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset0:5 offset1:12
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i64 0, i64 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i64 0, i64 24
+  %v0 = call i64 asm "; def $0", "=v"()
+  %av0 = call i64 asm "; def $0", "=^VA"()
+  store i64 %v0, ptr addrspace(3) %gep.0
+  store i64 %av0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; Should be able to constrain the AV operand to AGPR to enable merging.
+; Currently this does not happen: the checks show two separate
+; ds_write_b64 instructions instead of one ds_write2_b64.
+define void @ds_write2_b64_av_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b64_av_a:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a[0:1]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b64 v0, v[2:3] offset:40
+; GCN-NEXT:    ds_write_b64 v0, a[0:1] offset:96
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i64 0, i64 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i64 0, i64 24
+  %av0 = call i64 asm "; def $0", "=^VA"()
+  %a0 = call i64 asm "; def $0", "=a"()
+  store i64 %av0, ptr addrspace(3) %gep.0
+  store i64 %a0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; Should be able to constrain the AV operand to AGPR to enable merging.
+; Currently not merged: two separate ds_write_b64 instructions are
+; emitted (see checks).
+; Note: renamed from ds_write2_b64x_a_av — the "b64x" was a typo; every
+; sibling test in this file uses the ds_write2_b64_* naming scheme.
+define void @ds_write2_b64_a_av(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b64_a_av:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a[0:1]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b64 v0, a[0:1] offset:40
+; GCN-NEXT:    ds_write_b64 v0, v[2:3] offset:96
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i64 0, i64 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i64 0, i64 24
+  %a0 = call i64 asm "; def $0", "=a"()
+  %av0 = call i64 asm "; def $0", "=^VA"()
+  store i64 %a0, ptr addrspace(3) %gep.0
+  store i64 %av0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; Both data values in AGPRs. The stride-64 element offsets (64 and 256)
+; are candidates for the st64 write2 form, but two separate ds_write_b64
+; instructions are emitted (byte offsets 256 and 1024).
+define void @ds_write2st64_b64_a_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b64_a_a:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a[0:1]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b64 v0, a[0:1] offset:256
+; GCN-NEXT:    ds_write_b64 v0, a[2:3] offset:1024
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %a0 = call i64 asm "; def $0", "=a"()
+  %a1 = call i64 asm "; def $0", "=a"()
+  store i64 %a0, ptr addrspace(3) %gep.0
+  store i64 %a1, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; Mixed AGPR/VGPR data with stride-64 offsets: not merged, two separate
+; ds_write_b64 instructions are emitted.
+define void @ds_write2st64_b64_a_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b64_a_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a[0:1]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b64 v0, a[0:1] offset:256
+; GCN-NEXT:    ds_write_b64 v0, v[2:3] offset:1024
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %a0 = call i64 asm "; def $0", "=a"()
+  %v0 = call i64 asm "; def $0", "=v"()
+  store i64 %a0, ptr addrspace(3) %gep.0
+  store i64 %v0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; Mixed VGPR/AGPR data (VGPR value stored at the low offset): not
+; merged, two separate ds_write_b64 instructions are emitted.
+; Note: the SSA value names were swapped relative to their constraints
+; (%v0 was bound to "=a" and %a0 to "=v"); the names are corrected here
+; while preserving the original call order and store destinations, so
+; codegen is unchanged.
+define void @ds_write2st64_b64_v_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b64_v_a:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a[0:1]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b64 v0, v[2:3] offset:256
+; GCN-NEXT:    ds_write_b64 v0, a[0:1] offset:1024
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %a0 = call i64 asm "; def $0", "=a"()
+  %v0 = call i64 asm "; def $0", "=v"()
+  store i64 %v0, ptr addrspace(3) %gep.0
+  store i64 %a0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; Both data values in VGPRs: merged into a single ds_write2_b64 with
+; offset0:32 offset1:128 (in 8-byte units, i.e. byte offsets 256/1024),
+; so the st64 form is not required here.
+define void @ds_write2st64_b64_v_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b64_v_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[4:5]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset0:32 offset1:128
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %v0 = call i64 asm "; def $0", "=v"()
+  %v1 = call i64 asm "; def $0", "=v"()
+  store i64 %v0, ptr addrspace(3) %gep.0
+  store i64 %v1, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; Both AV values are assigned VGPRs and the stores merge into a single
+; ds_write2_b64 (see checks).
+define void @ds_write2st64_b64_av_av(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b64_av_av:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[4:5]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset0:32 offset1:128
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %av0 = call i64 asm "; def $0", "=^VA"()
+  %av1 = call i64 asm "; def $0", "=^VA"()
+  store i64 %av0, ptr addrspace(3) %gep.0
+  store i64 %av1, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; The AV value is constrained to a VGPR and the stores merge into a
+; single ds_write2_b64 (see checks).
+define void @ds_write2st64_b64_av_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b64_av_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[4:5]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset0:32 offset1:128
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %av0 = call i64 asm "; def $0", "=^VA"()
+  %v0 = call i64 asm "; def $0", "=v"()
+  store i64 %av0, ptr addrspace(3) %gep.0
+  store i64 %v0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; Same as the av_v case with the operand order reversed; the AV value is
+; constrained to a VGPR and a single ds_write2_b64 is formed.
+define void @ds_write2st64_b64_v_av(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b64_v_av:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[4:5]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset0:32 offset1:128
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %v0 = call i64 asm "; def $0", "=v"()
+  %av0 = call i64 asm "; def $0", "=^VA"()
+  store i64 %v0, ptr addrspace(3) %gep.0
+  store i64 %av0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; AV value paired with a pure-AGPR value: currently not merged — two
+; separate ds_write_b64 instructions are emitted (see checks).
+define void @ds_write2st64_b64_av_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b64_av_a:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a[0:1]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b64 v0, v[2:3] offset:256
+; GCN-NEXT:    ds_write_b64 v0, a[0:1] offset:1024
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %av0 = call i64 asm "; def $0", "=^VA"()
+  %a0 = call i64 asm "; def $0", "=a"()
+  store i64 %av0, ptr addrspace(3) %gep.0
+  store i64 %a0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; Pure-AGPR value paired with an AV value: currently not merged — two
+; separate ds_write_b64 instructions are emitted (see checks).
+define void @ds_write2st64_b64_a_av(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b64_a_av:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a[0:1]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_write_b64 v0, a[0:1] offset:256
+; GCN-NEXT:    ds_write_b64 v0, v[2:3] offset:1024
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+  %a0 = call i64 asm "; def $0", "=a"()
+  %av0 = call i64 asm "; def $0", "=^VA"()
+  store i64 %a0, ptr addrspace(3) %gep.0
+  store i64 %av0, ptr addrspace(3) %gep.1
+  ret void
+}
+
+; Register-pressure variant: the inline asm claims v[0:31] and v[32:63],
+; so no free VGPR pairs are available for the data operands. The checks
+; show the AV values are placed in AGPRs (a[2:3], a[4:5]) and then copied
+; back to VGPRs with v_accvgpr_read to form a VGPR-data ds_write2_b64,
+; rather than writing the AGPR data directly.
+; NOTE(review): attribute #0 sets "amdgpu-waves-per-eu"="10,10",
+; presumably to shrink the register budget and force this pressure —
+; confirm against the b32 cases earlier in the file.
+define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b64_av_av_no_vgprs:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v40 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v41 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v42 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v43 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v44 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v45 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v46 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v47 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v56 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v57 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v58 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a16, v59 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a17, v60 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a18, v61 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a19, v62 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a20, v63 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v0
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a[2:3]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def a[4:5]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; def v[0:31]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT:    v_accvgpr_write_b32 a21, v31 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset0:5 offset1:12
+; GCN-NEXT:    v_accvgpr_write_b32 a31, v21 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a30, v22 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a29, v23 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a28, v24 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a27, v25 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a26, v26 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a25, v27 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a24, v28 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a23, v29 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_write_b32 a22, v30 ; Reload Reuse
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GCN-NEXT:    v_accvgpr_read_b32 v21, a31 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v22, a30 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v23, a29 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v24, a28 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v25, a27 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v26, a26 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v27, a25 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v28, a24 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v29, a23 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v30, a22 ; Reload Reuse
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_accvgpr_read_b32 v31, a21 ; Reload Reuse
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; use v[0:31]
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    v_accvgpr_read_b32 v63, a20 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v62, a19 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v61, a18 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v60, a17 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v59, a16 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v58, a15 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v57, a14 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v56, a13 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v47, a12 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v46, a11 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v45, a10 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v44, a9 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v43, a8 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v42, a7 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v41, a6 ; Reload Reuse
+; GCN-NEXT:    v_accvgpr_read_b32 v40, a1 ; Reload Reuse
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+  %av0 = call i64 asm sideeffect "; def $0", "=^VA"()
+  %av1 = call i64 asm sideeffect "; def $0", "=^VA"()
+  %vgpr.def = call { <32 x i32>, <32 x i32> }  asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
+  %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
+  %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
+  store i64 %av0, ptr addrspace(3) %gep.0
+  store i64 %av1, ptr addrspace(3) %gep.1
+  call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
+  ret void
+}
+
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }



More information about the llvm-commits mailing list