[llvm] r363946 - [AMDGPU] gfx10 tests. NFC.

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 20 09:29:40 PDT 2019


Author: rampitec
Date: Thu Jun 20 09:29:40 2019
New Revision: 363946

URL: http://llvm.org/viewvc/llvm-project?rev=363946&view=rev
Log:
[AMDGPU] gfx10 tests. NFC.

Added:
    llvm/trunk/test/CodeGen/AMDGPU/gfx10-vop-literal.ll
    llvm/trunk/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll
    llvm/trunk/test/CodeGen/AMDGPU/mixed_wave32_wave64.ll
    llvm/trunk/test/CodeGen/AMDGPU/optimize-negated-cond-exec-masking-wave32.mir
    llvm/trunk/test/MC/AMDGPU/gfx10-vop2be-literal.s
    llvm/trunk/test/MC/AMDGPU/gfx10_asm_all.s
    llvm/trunk/test/MC/AMDGPU/gfx10_asm_dpp16.s
    llvm/trunk/test/MC/AMDGPU/gfx10_asm_dpp8.s
    llvm/trunk/test/MC/AMDGPU/gfx10_asm_err.s
    llvm/trunk/test/MC/AMDGPU/lds_direct-gfx10.s
    llvm/trunk/test/MC/Disassembler/AMDGPU/gfx10-vop2be-literal.txt
    llvm/trunk/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt
    llvm/trunk/test/MC/Disassembler/AMDGPU/gfx10_dasm_dpp16.txt
    llvm/trunk/test/MC/Disassembler/AMDGPU/gfx10_dasm_dpp8.txt
Modified:
    llvm/trunk/test/CodeGen/AMDGPU/hsa.ll
    llvm/trunk/test/CodeGen/AMDGPU/idot2.ll
    llvm/trunk/test/CodeGen/AMDGPU/idot4s.ll
    llvm/trunk/test/CodeGen/AMDGPU/idot4u.ll
    llvm/trunk/test/CodeGen/AMDGPU/idot8s.ll
    llvm/trunk/test/CodeGen/AMDGPU/idot8u.ll
    llvm/trunk/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll

Added: llvm/trunk/test/CodeGen/AMDGPU/gfx10-vop-literal.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/gfx10-vop-literal.ll?rev=363946&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/gfx10-vop-literal.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/gfx10-vop-literal.ll Thu Jun 20 09:29:40 2019
@@ -0,0 +1,64 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; GCN-LABEL: {{^}}test_add_lit:
+; GFX10: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, 0x80992bff, v{{[0-9]+}}
+; GFX10: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0xe7, v{{[0-9]+}}, vcc_lo
+; GFX9:  v_mov_b32_e32 [[C2:v[0-9]+]], 0xe7
+; GFX9:  v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0x80992bff, v{{[0-9]+}}
+; GFX9:  v_addc_co_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, [[C2]], vcc
+define amdgpu_kernel void @test_add_lit(i64 addrspace(1)* %p) { ; 64-bit add of a constant whose two dwords are both non-inline literals
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %ptr = getelementptr inbounds i64, i64 addrspace(1)* %p, i32 %id
+  %load = load i64, i64 addrspace(1)* %ptr, align 8
+  %add = add nsw i64 %load, 994294967295 ; = 0xe7_80992bff: low dword 0x80992bff, high dword 0xe7 (the values matched above)
+  store i64 %add, i64 addrspace(1)* %ptr, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_cndmask_lit:
+; GFX10: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3039, v{{[0-9]+}}, vcc_lo
+; GFX9:  v_mov_b32_e32 [[C:v[0-9]+]], 0x3039
+; GFX9:  v_cndmask_b32_e32 v{{[0-9]+}}, [[C]], v{{[0-9]+}}, vcc
+define amdgpu_kernel void @test_cndmask_lit(i32 addrspace(1)* %p) { ; select between a non-inline literal and a loaded value
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %n = add nuw nsw i32 %id, 1
+  %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %id
+  %v1 = load i32, i32 addrspace(1)* %p1, align 4
+  %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %n
+  %v2 = load i32, i32 addrspace(1)* %p2, align 4
+  %cmp = icmp sgt i32 %v1, 0
+  %sel = select i1 %cmp, i32 12345, i32 %v2 ; 12345 = 0x3039, the literal matched in the cndmask checks above
+  store i32 %sel, i32 addrspace(1)* %p1, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bfe_2lit_s:
+; GFX10: v_mov_b32_e32 [[C1:v[0-9]+]], 0xddd5
+; GFX10: v_bfe_u32 v{{[0-9]+}}, 0x3039, s{{[0-9]+}}, [[C1]]
+; GFX9-DAG: v_mov_b32_e32 [[C2:v[0-9]+]], 0xddd5
+; GFX9-DAG: s_movk_i32 [[C1:s[0-9]+]], 0x3039
+; GFX9:     v_bfe_u32 v{{[0-9]+}}, [[C1]], v{{[0-9]+}}, [[C2]]
+define amdgpu_kernel void @test_bfe_2lit_s(i32 addrspace(1)* %p, i32 %src) { ; ubfe with two non-inline constants around a kernel-argument operand
+  %field = call i32 @llvm.amdgcn.ubfe.i32(i32 12345, i32 %src, i32 56789) ; 12345 = 0x3039, 56789 = 0xddd5
+  store i32 %field, i32 addrspace(1)* %p, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bfe_2lit_v:
+; GFX10: s_movk_i32 [[C1:s[0-9]+]], 0x3039
+; GFX10: v_bfe_u32 v{{[0-9]+}}, [[C1]], v{{[0-9]+}}, 0xddd5
+; GFX9-DAG: v_mov_b32_e32 [[C2:v[0-9]+]], 0xddd5
+; GFX9-DAG: s_movk_i32 [[C1:s[0-9]+]], 0x3039
+; GFX9:     v_bfe_u32 v{{[0-9]+}}, [[C1]], v{{[0-9]+}}, [[C2]]
+define amdgpu_kernel void @test_bfe_2lit_v(i32 addrspace(1)* %p) { ; ubfe with two non-inline constants around a value loaded per work-item
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %slot = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid
+  %val = load i32, i32 addrspace(1)* %slot, align 4
+  %field = call i32 @llvm.amdgcn.ubfe.i32(i32 12345, i32 %val, i32 56789) ; 12345 = 0x3039, 56789 = 0xddd5
+  store i32 %field, i32 addrspace(1)* %slot, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.ubfe.i32(i32, i32, i32)

Modified: llvm/trunk/test/CodeGen/AMDGPU/hsa.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/hsa.ll?rev=363946&r1=363945&r2=363946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/hsa.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/hsa.ll Thu Jun 20 09:29:40 2019
@@ -2,8 +2,10 @@
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,-flat-for-global | FileCheck --check-prefix=HSA-CI %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3 | FileCheck --check-prefix=HSA %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3,-flat-for-global | FileCheck --check-prefix=HSA-VI %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj -mattr=-code-object-v3 | llvm-readobj --symbols -S --sd | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | llvm-readobj --symbols -S --sd | FileCheck %s --check-prefix=ELF
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj -mattr=-code-object-v3 | llvm-readobj --symbols -S --sd | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | llvm-readobj --symbols -S --sd | FileCheck %s --check-prefix=ELF
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+WavefrontSize32,-WavefrontSize64,-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64,-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
 
 ; The SHT_NOTE section contains the output from the .hsa_code_object_*
 ; directives.
@@ -45,7 +47,15 @@
 ; HSA: .amd_kernel_code_t
 ; HSA: enable_sgpr_private_segment_buffer = 1
 ; HSA: enable_sgpr_kernarg_segment_ptr = 1
-; HSA: wavefront_size = 6
+
+; PRE-GFX10: enable_wavefront_size32 = 0
+; GFX10-W32: enable_wavefront_size32 = 1
+; GFX10-W64: enable_wavefront_size32 = 0
+
+; PRE-GFX10: wavefront_size = 6
+; GFX10-W32: wavefront_size = 5
+; GFX10-W64: wavefront_size = 6
+
 ; HSA: call_convention = -1
 ; HSA: .end_amd_kernel_code_t
 ; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
@@ -55,7 +65,8 @@
 ; On VI+ we also need to set MTYPE = 2
 ; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000
 ; Make sure we generate flat store for HSA
-; HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
+; PRE-GFX10: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
+; GFX10: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
 
 ; HSA: .Lfunc_end0:
 ; HSA: .size   simple, .Lfunc_end0-simple

Modified: llvm/trunk/test/CodeGen/AMDGPU/idot2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/idot2.ll?rev=363946&r1=363945&r2=363946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/idot2.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/idot2.ll Thu Jun 20 09:29:40 2019
@@ -3,6 +3,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NODL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s
 
 ; add(mul(S0.x, S1.y),
 ;     add (mul (S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S1, S2, S3)
@@ -96,6 +98,23 @@ define amdgpu_kernel void @udot2(<2 x i1
 ; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s3, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot2:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s3, s2, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                  <2 x i16> addrspace(1)* %src2,
                                  i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -220,6 +239,29 @@ define amdgpu_kernel void @udot2_MulMul(
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot2_MulMul:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s0, s3, s2
+; GFX10-DL-NEXT:    s_and_b32 s1, s4, s2
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s3, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s4, 16
+; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v2, s1, s0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, s5, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                         <2 x i16> addrspace(1)* %src2,
                                         i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -330,6 +372,23 @@ define amdgpu_kernel void @idot2(<2 x i1
 ; GFX9-DL-NEXT:    v_dot2_i32_i16 v2, s3, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot2:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    v_dot2_i32_i16 v2, s3, s2, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                  <2 x i16> addrspace(1)* %src2,
                                  i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -447,6 +506,28 @@ define amdgpu_kernel void @idot2_MixedTy
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot2_MixedTypedMul:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s3, 16
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    s_sext_i32_i16 s2, s2
+; GFX10-DL-NEXT:    s_sext_i32_i16 s3, s3
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                <2 x i16> addrspace(1)* %src2,
                                                i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -561,6 +642,23 @@ define amdgpu_kernel void @udot2_alt_Add
 ; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s3, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot2_alt_AddOperands:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s3, s2, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                  <2 x i16> addrspace(1)* %src2,
                                                  i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -678,6 +776,28 @@ define amdgpu_kernel void @idot2_MixedEx
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot2_MixedExt:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_ashr_i32 s0, s2, 16
+; GFX10-DL-NEXT:    s_ashr_i32 s1, s3, 16
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    s_sext_i32_i16 s2, s2
+; GFX10-DL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                           <2 x i16> addrspace(1)* %src2,
                                           i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -779,6 +899,25 @@ define amdgpu_kernel void @notudot2_Same
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, s0, v2
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: notudot2_SameVec:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s0, s2, 0xffff
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s3, 16
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s1, s4
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s0, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                             <2 x i16> addrspace(1)* %src2,
                                             i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -893,6 +1032,23 @@ define amdgpu_kernel void @udot2_v4i16(<
 ; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s3, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot2_v4i16:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s3, s2, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i16> addrspace(1)* %src2,
                                        i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -1007,6 +1163,23 @@ define amdgpu_kernel void @udot2_v4i16_H
 ; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s3, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot2_v4i16_Hi:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x4
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x4
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s3, s2, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                           <4 x i16> addrspace(1)* %src2,
                                           i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -1128,6 +1301,29 @@ define amdgpu_kernel void @notudot2_v4i1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: notudot2_v4i16_Even:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_mov_b32 s8, 0xffff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s0, s3, s8
+; GFX10-DL-NEXT:    s_and_b32 s1, s5, s8
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-DL-NEXT:    s_and_b32 s2, s2, s8
+; GFX10-DL-NEXT:    s_and_b32 s3, s4, s8
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                <4 x i16> addrspace(1)* %src2,
                                                i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -1249,6 +1445,29 @@ define amdgpu_kernel void @notudot2_v4i1
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: notudot2_v4i16_Middle:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_mov_b32 s8, 0xffff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s0, s3, s8
+; GFX10-DL-NEXT:    s_and_b32 s1, s5, s8
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s4, 16
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                  <4 x i16> addrspace(1)* %src2,
                                                  i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -1370,6 +1589,29 @@ define amdgpu_kernel void @notudot2_Diff
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: notudot2_DiffIndex:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s3, 16
+; GFX10-DL-NEXT:    s_and_b32 s1, s4, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-DL-NEXT:    s_and_b32 s2, s3, s2
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s4, 16
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                               <2 x i16> addrspace(1)* %src2,
                                               i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -1495,6 +1737,30 @@ define amdgpu_kernel void @udot2_Multipl
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot2_MultipleUses_add1:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s3, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s4, 16
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-DL-NEXT:    s_and_b32 s3, s3, s2
+; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s2, s3, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
                                                    i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -1618,6 +1884,29 @@ define amdgpu_kernel void @idot2_Multipl
 ; GFX9-DL-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot2_MultipleUses_add1:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_ashr_i32 s0, s2, 16
+; GFX10-DL-NEXT:    s_ashr_i32 s1, s3, 16
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    s_sext_i32_i16 s2, s2
+; GFX10-DL-NEXT:    s_sext_i32_i16 s3, s3
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s3, s2, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
                                                    i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -1745,6 +2034,30 @@ define amdgpu_kernel void @udot2_Multipl
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot2_MultipleUses_mul1:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s0, s3, s2
+; GFX10-DL-NEXT:    s_and_b32 s1, s4, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s3, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s4, 16
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
                                                    i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -1869,6 +2182,29 @@ define amdgpu_kernel void @idot2_Multipl
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot2_MultipleUses_mul1:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_sext_i32_i16 s0, s2
+; GFX10-DL-NEXT:    s_sext_i32_i16 s1, s3
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX10-DL-NEXT:    s_ashr_i32 s3, s3, 16
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
                                                    i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -1997,6 +2333,30 @@ define amdgpu_kernel void @udot2_Multipl
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s3, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s4, 16
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-DL-NEXT:    s_and_b32 s3, s3, s2
+; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
                                                    i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -2121,6 +2481,29 @@ define amdgpu_kernel void @idot2_Multipl
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_ashr_i32 s0, s2, 16
+; GFX10-DL-NEXT:    s_ashr_i32 s1, s3, 16
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    s_sext_i32_i16 s2, s2
+; GFX10-DL-NEXT:    s_sext_i32_i16 s3, s3
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                    <2 x i16> addrspace(1)* %src2,
                                                    i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -2238,6 +2621,23 @@ define amdgpu_kernel void @udot2_acc16(<
 ; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot2_acc16:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot2_u32_u16 v2, s0, s1, v2
+; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                        <2 x i16> addrspace(1)* %src2,
                                        i16 addrspace(1)* nocapture %dst) {
 entry:
@@ -2370,6 +2770,36 @@ define amdgpu_kernel void @notsdot2_sext
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, v5, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: notsdot2_sext8:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    global_load_ushort v2, v[2:3], off
+; GFX10-DL-NEXT:    global_load_ushort v7, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DL-NEXT:    v_and_b32_sdwa v1, v2, v4
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_and_b32_sdwa v3, v7, v4
+; GFX10-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX10-DL-NEXT:    v_bfe_i32 v0, v7, 0, 8
+; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX10-DL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mad_i32_i24 v1, v3, v1, s2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, v0, v2, v1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                           <2 x i8> addrspace(1)* %src2,
                                           i32 addrspace(1)* nocapture %dst) {
 entry:

Modified: llvm/trunk/test/CodeGen/AMDGPU/idot4s.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/idot4s.ll?rev=363946&r1=363945&r2=363946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/idot4s.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/idot4s.ll Thu Jun 20 09:29:40 2019
@@ -3,6 +3,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 
 define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX7-LABEL: idot4_acc32:
@@ -114,6 +116,23 @@ define amdgpu_kernel void @idot4_acc32(<
 ; GFX9-DL-NEXT:    v_dot4_i32_i8 v2, s2, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    v_dot4_i32_i8 v2, s2, s3, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i8> addrspace(1)* %src2,
                                        i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -274,6 +293,23 @@ define amdgpu_kernel void @idot4_acc16(<
 ; GFX9-DL-NEXT:    v_dot4_i32_i8 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc16:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_i32_i8 v2, s0, s1, v2
+; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i8> addrspace(1)* %src2,
                                        i16 addrspace(1)* nocapture %dst) {
 entry:
@@ -426,6 +462,23 @@ define amdgpu_kernel void @idot4_acc8(<4
 ; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc8:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v2, s0, s1, v2
+; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                       <4 x i8> addrspace(1)* %src2,
                                       i8 addrspace(1)* nocapture %dst) {
 entry:
@@ -585,6 +638,35 @@ define amdgpu_kernel void @idot4_multius
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_multiuse_mul1:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_sext_i32_i8 s0, s2
+; GFX10-DL-NEXT:    s_sext_i32_i8 s1, s3
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    s_bfe_i32 s4, s2, 0x80008
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s3, 0x80008
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s4, s2, 0x80010
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s3, 0x80010
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    s_ashr_i32 s0, s2, 24
+; GFX10-DL-NEXT:    s_ashr_i32 s1, s3, 24
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                <4 x i8> addrspace(1)* %src2,
                                                i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -754,6 +836,37 @@ define amdgpu_kernel void @idot4_acc32_v
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc32_vecMul:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_and_b32_sdwa v3, s2, v2
+; GFX10-DL-NEXT:    v_and_b32_sdwa v2, s3, v2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-DL-NEXT:    s_sext_i32_i8 s0, s2
+; GFX10-DL-NEXT:    s_sext_i32_i8 s1, s3
+; GFX10-DL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX10-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX10-DL-NEXT:    s_bfe_i32 s4, s2, 0x80010
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s3, 0x80010
+; GFX10-DL-NEXT:    v_mad_i32_i24 v4, s0, s1, v4
+; GFX10-DL-NEXT:    s_ashr_i32 s0, s2, 24
+; GFX10-DL-NEXT:    s_ashr_i32 s1, s3, 24
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, v3, v2, v4
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
                                               i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -939,6 +1052,47 @@ define amdgpu_kernel void @idot4_acc16_v
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot4_acc16_vecMul:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ushort v3, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_bfe_i32 s0, s2, 0x80000
+; GFX10-DL-NEXT:    s_bfe_i32 s1, s3, 0x80000
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX10-DL-NEXT:    v_and_b32_sdwa v4, sext(s2), v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, s0, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, s1, v2
+; GFX10-DL-NEXT:    v_and_b32_sdwa v5, sext(s3), v2
+; GFX10-DL-NEXT:    s_bfe_i32 s0, s4, 0x80000
+; GFX10-DL-NEXT:    s_bfe_i32 s1, s5, 0x80000
+; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v7
+; GFX10-DL-NEXT:    v_and_b32_sdwa v8, sext(s4), v2
+; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v6
+; GFX10-DL-NEXT:    v_and_b32_sdwa v6, sext(s5), v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, s1, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, s0, v2
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v6, 16, v7
+; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v8, 16, v2
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v5
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v4, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v3, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
                                               i16 addrspace(1)* nocapture %dst) {
 entry:

Modified: llvm/trunk/test/CodeGen/AMDGPU/idot4u.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/idot4u.ll?rev=363946&r1=363945&r2=363946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/idot4u.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/idot4u.ll Thu Jun 20 09:29:40 2019
@@ -3,6 +3,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 
 define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX7-LABEL: udot4_acc32:
@@ -117,6 +119,23 @@ define amdgpu_kernel void @udot4_acc32(<
 ; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_acc32:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v2, s2, s3, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i8> addrspace(1)* %src2,
                                        i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -270,6 +289,23 @@ define amdgpu_kernel void @udot4_acc16(<
 ; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_acc16:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v2, s0, s1, v2
+; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                        <4 x i8> addrspace(1)* %src2,
                                        i16 addrspace(1)* nocapture %dst) {
 entry:
@@ -423,6 +459,23 @@ define amdgpu_kernel void @udot4_acc8(<4
 ; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_acc8:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v2, s0, s1, v2
+; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                       <4 x i8> addrspace(1)* %src2,
                                       i8 addrspace(1)* nocapture %dst) {
 entry:
@@ -552,6 +605,29 @@ define amdgpu_kernel void @udot2_8(<4 x
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot2_8:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s0, s3, s2
+; GFX10-DL-NEXT:    s_and_b32 s1, s4, s2
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s4, 0x80008
+; GFX10-DL-NEXT:    s_bfe_u32 s3, s3, 0x80008
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s3, s2, v2
+; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                    <4 x i8> addrspace(1)* %src2,
                                    i8 addrspace(1)* nocapture %dst) {
 entry:
@@ -686,6 +762,23 @@ define amdgpu_kernel void @udot4_Commuta
 ; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s3, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_CommutationInsideMAD:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-DL-NEXT:    v_dot4_u32_u8 v2, s1, s0, v2
+; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                       <4 x i8> addrspace(1)* %src2,
                                                       i8 addrspace(1)* nocapture %dst) {
 entry:
@@ -847,6 +940,35 @@ define amdgpu_kernel void @udot4_Commuta
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_CommutationAccrossMADs:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s3, 0x80008
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x80008
+; GFX10-DL-NEXT:    s_and_b32 s5, s3, s2
+; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s3, 0x80010
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s4, 0x80010
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s3, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s4, 24
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s5, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s7, s6, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s1, s0, v2
+; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                         <4 x i8> addrspace(1)* %src2,
                                                         i8 addrspace(1)* nocapture %dst) {
 entry:
@@ -1011,6 +1133,36 @@ define amdgpu_kernel void @udot4_multius
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_multiuse_mul1:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s0, s3, s2
+; GFX10-DL-NEXT:    s_and_b32 s1, s4, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s3, 0x80008
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s4, 0x80008
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s5, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s3, 0x80010
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s4, 0x80010
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s3, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s4, 24
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s5, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                <4 x i8> addrspace(1)* %src2,
                                                i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -1188,6 +1340,37 @@ define amdgpu_kernel void @udot4_multius
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_multiuse_add1:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s3, 0x80008
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x80008
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-DL-NEXT:    s_and_b32 s6, s3, s2
+; GFX10-DL-NEXT:    s_and_b32 s2, s4, s2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s3, 0x80010
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x80010
+; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s6, s2, v2
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s3, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s4, 24
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, s5, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s0, s1, v3
+; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s2, s3, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                <4 x i8> addrspace(1)* %src2,
                                                i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -1356,6 +1539,34 @@ define amdgpu_kernel void @notdot4_mixed
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: notdot4_mixedtypes:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x80008
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s3, 0x80008
+; GFX10-DL-NEXT:    s_sext_i32_i8 s4, s2
+; GFX10-DL-NEXT:    s_sext_i32_i8 s5, s3
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s2, 0x80010
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s3, 0x80010
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s2, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s3, 24
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s4, s5, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s6, s7, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
                                               i16 addrspace(1)* nocapture %dst) {
 entry:
@@ -1522,6 +1733,36 @@ define amdgpu_kernel void @udot4_acc32_v
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_acc32_vecMul:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s0, s3, s2
+; GFX10-DL-NEXT:    s_and_b32 s1, s4, s2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX10-DL-NEXT:    v_and_b32_sdwa v4, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v2, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s3, 0x80010
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s4, 0x80010
+; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s0, s1, v3
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s3, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s4, 24
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, v4, v2, v3
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s5, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
                                               i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -1693,6 +1934,43 @@ define amdgpu_kernel void @udot4_acc16_v
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_acc16_vecMul:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ushort v3, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_and_b32_sdwa v4, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_and_b32_sdwa v5, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s3, 16
+; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v7
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v6
+; GFX10-DL-NEXT:    s_lshr_b32 s3, s3, 24
+; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX10-DL-NEXT:    v_lshl_or_b32 v5, s3, 16, v6
+; GFX10-DL-NEXT:    v_lshl_or_b32 v2, s2, 16, v2
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v5
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v4, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v3, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                               <4 x i8> addrspace(1)* %src2,
                                               i16 addrspace(1)* nocapture %dst) {
 entry:
@@ -1874,6 +2152,46 @@ define amdgpu_kernel void @udot4_acc8_ve
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot4_acc8_vecMul:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v3, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s3, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s5, s4, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s3, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX10-DL-NEXT:    v_and_b32_sdwa v4, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v5, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s3, s4
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, s0, s5
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v8, s1, s6
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v4, v5
+; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v6, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v7, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v8, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v2, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v2, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v3, v4
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                              <4 x i8> addrspace(1)* %src2,
                                              i8 addrspace(1)* nocapture %dst) {
 entry:

Modified: llvm/trunk/test/CodeGen/AMDGPU/idot8s.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/idot8s.ll?rev=363946&r1=363945&r2=363946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/idot8s.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/idot8s.ll Thu Jun 20 09:29:40 2019
@@ -3,6 +3,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 
 define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: idot8_acc32:
@@ -162,6 +164,23 @@ define amdgpu_kernel void @idot8_acc32(<
 ; GFX9-DL-NEXT:    v_dot8_i32_i4 v2, s2, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot8_acc32:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-DL-NEXT:    v_dot8_i32_i4 v2, s2, s4, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
                                        i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -449,6 +468,56 @@ define amdgpu_kernel void @idot8_acc16(<
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot8_acc16:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ushort v3, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s2, 12
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s4, 12
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s2, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s4, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s2, 0x40004
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s0
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s1
+; GFX10-DL-NEXT:    s_bfe_i32 s0, s4, 0x40004
+; GFX10-DL-NEXT:    s_bfe_i32 s1, s2, 0x40008
+; GFX10-DL-NEXT:    s_bfe_i32 s8, s4, 0x40008
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, v4, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, v5, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s9, s2, 0x40010
+; GFX10-DL-NEXT:    s_bfe_i32 s10, s4, 0x40010
+; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v6, s1, s8
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v4
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 12, v5
+; GFX10-DL-NEXT:    s_bfe_i32 s1, s2, 0x40014
+; GFX10-DL-NEXT:    s_bfe_i32 s8, s4, 0x40014
+; GFX10-DL-NEXT:    s_bfe_i32 s11, s2, 0x40018
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, v4, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, v5, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s12, s4, 0x40018
+; GFX10-DL-NEXT:    s_ashr_i32 s2, s2, 28
+; GFX10-DL-NEXT:    s_ashr_i32 s4, s4, 28
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s5, s6, v3
+; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s7, s0, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, v4, v2, v3
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s9, s10, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s1, s8, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s11, s12, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
                                        i16 addrspace(1)* nocapture %dst) {
 entry:
@@ -744,6 +813,57 @@ define amdgpu_kernel void @idot8_acc8(<8
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot8_acc8:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:   s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:   s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:   v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT:   s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT:   ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:   s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:   s_load_dword s4, s[4:5], 0x0
+; GFX10-DL-NEXT:   s_load_dword s5, s[6:7], 0x0
+; GFX10-DL-NEXT:   v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:   v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:   global_load_ubyte v3, v[0:1], off
+; GFX10-DL-NEXT:   s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:   s_lshr_b32 s0, s4, 12
+; GFX10-DL-NEXT:   s_lshr_b32 s1, s5, 12
+; GFX10-DL-NEXT:   s_bfe_i32 s6, s4, 0x40000
+; GFX10-DL-NEXT:   s_bfe_i32 s7, s5, 0x40000
+; GFX10-DL-NEXT:   s_bfe_i32 s8, s4, 0x40004
+; GFX10-DL-NEXT:   v_lshlrev_b16_e64 v4, 12, s0
+; GFX10-DL-NEXT:   v_lshlrev_b16_e64 v5, 12, s1
+; GFX10-DL-NEXT:   s_bfe_i32 s0, s5, 0x40004
+; GFX10-DL-NEXT:   s_bfe_i32 s1, s4, 0x40008
+; GFX10-DL-NEXT:   s_bfe_i32 s9, s5, 0x40008
+; GFX10-DL-NEXT:   v_and_b32_e32 v4, v4, v2
+; GFX10-DL-NEXT:   v_and_b32_e32 v2, v5, v2
+; GFX10-DL-NEXT:   s_bfe_i32 s10, s4, 0x40010
+; GFX10-DL-NEXT:   s_bfe_i32 s11, s5, 0x40010
+; GFX10-DL-NEXT:   v_mul_i32_i24_e64 v5, s1, s9
+; GFX10-DL-NEXT:   v_ashrrev_i16_e64 v4, 12, v4
+; GFX10-DL-NEXT:   v_ashrrev_i16_e64 v2, 12, v2
+; GFX10-DL-NEXT:   s_bfe_i32 s1, s4, 0x40014
+; GFX10-DL-NEXT:   s_bfe_i32 s9, s5, 0x40014
+; GFX10-DL-NEXT:   s_bfe_i32 s12, s4, 0x40018
+; GFX10-DL-NEXT:   v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:   v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:   s_bfe_i32 s2, s5, 0x40018
+; GFX10-DL-NEXT:   s_ashr_i32 s4, s4, 28
+; GFX10-DL-NEXT:   s_ashr_i32 s5, s5, 28
+; GFX10-DL-NEXT:   s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:   v_mad_i32_i24 v3, s6, s7, v3
+; GFX10-DL-NEXT:   v_mad_i32_i24 v3, s8, s0, v3
+; GFX10-DL-NEXT:   v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX10-DL-NEXT:   v_mad_u32_u24 v2, v4, v2, v3
+; GFX10-DL-NEXT:   v_mad_i32_i24 v2, s10, s11, v2
+; GFX10-DL-NEXT:   v_mad_i32_i24 v2, s1, s9, v2
+; GFX10-DL-NEXT:   v_mad_i32_i24 v2, s12, s2, v2
+; GFX10-DL-NEXT:   v_mad_i32_i24 v2, s4, s5, v2
+; GFX10-DL-NEXT:   global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:   s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
                                        i8 addrspace(1)* nocapture %dst) {
 entry:
@@ -1010,6 +1130,48 @@ define amdgpu_kernel void @idot8_multius
 ; GFX9-DL-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot8_multiuses_mul1:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_bfe_i32 s0, s2, 0x40000
+; GFX10-DL-NEXT:    s_bfe_i32 s1, s4, 0x40000
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-DL-NEXT:    s_bfe_i32 s5, s2, 0x40004
+; GFX10-DL-NEXT:    s_bfe_i32 s6, s4, 0x40004
+; GFX10-DL-NEXT:    s_bfe_i32 s7, s2, 0x40008
+; GFX10-DL-NEXT:    s_bfe_i32 s8, s4, 0x40008
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s9, s2, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_i32 s10, s4, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_i32 s11, s2, 0x40010
+; GFX10-DL-NEXT:    s_bfe_i32 s12, s4, 0x40010
+; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s0, s1, v2
+; GFX10-DL-NEXT:    s_bfe_i32 s0, s2, 0x40014
+; GFX10-DL-NEXT:    s_bfe_i32 s1, s4, 0x40014
+; GFX10-DL-NEXT:    s_bfe_i32 s13, s2, 0x40018
+; GFX10-DL-NEXT:    s_bfe_i32 s14, s4, 0x40018
+; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s5, s6, v3
+; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s7, s8, v3
+; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s9, s10, v3
+; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s11, s12, v3
+; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s0, s1, v3
+; GFX10-DL-NEXT:    s_ashr_i32 s0, s2, 28
+; GFX10-DL-NEXT:    s_ashr_i32 s1, s4, 28
+; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s13, s14, v3
+; GFX10-DL-NEXT:    v_mad_i32_i24 v3, s0, s1, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                 <8 x i4> addrspace(1)* %src2,
                                                 i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -1325,6 +1487,60 @@ define amdgpu_kernel void @idot8_acc32_v
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot8_acc32_vecMul:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s5, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s7, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_lshl_b32 s1, s5, 28
+; GFX10-DL-NEXT:    s_lshl_b32 s9, s7, 28
+; GFX10-DL-NEXT:    s_lshl_b32 s11, s5, 24
+; GFX10-DL-NEXT:    s_lshl_b32 s13, s7, 24
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT:    s_ashr_i64 s[0:1], s[0:1], 60
+; GFX10-DL-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
+; GFX10-DL-NEXT:    s_lshl_b32 s1, s5, 20
+; GFX10-DL-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
+; GFX10-DL-NEXT:    s_lshl_b32 s9, s7, 20
+; GFX10-DL-NEXT:    s_ashr_i64 s[12:13], s[12:13], 60
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s8, v2
+; GFX10-DL-NEXT:    s_ashr_i64 s[0:1], s[0:1], 60
+; GFX10-DL-NEXT:    s_lshl_b32 s11, s5, 16
+; GFX10-DL-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
+; GFX10-DL-NEXT:    s_lshl_b32 s1, s7, 16
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s10, s12, v2
+; GFX10-DL-NEXT:    s_lshl_b32 s9, s5, 12
+; GFX10-DL-NEXT:    s_ashr_i64 s[10:11], s[10:11], 60
+; GFX10-DL-NEXT:    s_lshl_b32 s11, s7, 12
+; GFX10-DL-NEXT:    s_ashr_i64 s[12:13], s[0:1], 60
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s8, v2
+; GFX10-DL-NEXT:    s_lshl_b32 s1, s5, 8
+; GFX10-DL-NEXT:    s_ashr_i64 s[8:9], s[8:9], 60
+; GFX10-DL-NEXT:    s_ashr_i64 s[14:15], s[10:11], 60
+; GFX10-DL-NEXT:    s_lshl_b32 s9, s7, 8
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s10, s12, v2
+; GFX10-DL-NEXT:    s_ashr_i64 s[0:1], s[0:1], 60
+; GFX10-DL-NEXT:    s_lshl_b32 s11, s5, 4
+; GFX10-DL-NEXT:    s_lshl_b32 s1, s7, 4
+; GFX10-DL-NEXT:    s_ashr_i64 s[12:13], s[8:9], 60
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s8, s14, v2
+; GFX10-DL-NEXT:    s_ashr_i64 s[8:9], s[10:11], 60
+; GFX10-DL-NEXT:    s_ashr_i64 s[10:11], s[0:1], 60
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s12, v2
+; GFX10-DL-NEXT:    s_ashr_i64 s[0:1], s[4:5], 60
+; GFX10-DL-NEXT:    s_ashr_i64 s[4:5], s[6:7], 60
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s8, s10, v2
+; GFX10-DL-NEXT:    v_mad_i32_i24 v2, s0, s4, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
                                               i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -1635,6 +1851,74 @@ define amdgpu_kernel void @idot8_acc16_v
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot8_acc16_vecMul:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s0, s2, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s2, 0x40004
+; GFX10-DL-NEXT:    s_and_b32 s5, s4, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40008
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x4000c
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40010
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, s5 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s1, s6
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40014
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40010
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s4, 0x40014
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s1 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s7 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s5
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s6, s8
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40018
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40018
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s5, s2
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s6, s4
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v6, 12, v7 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v8, 12, s1 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v5, v5, v6
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v6, 12, v7 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v7, 12, v8 op_sel_hi:[0,1]
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v7, v6, v7
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v5
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v7
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
                                               i16 addrspace(1)* nocapture %dst) {
 entry:
@@ -2005,6 +2289,134 @@ define amdgpu_kernel void @idot8_acc8_ve
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: idot8_acc8_vecMul:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v3, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s4, 4
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s4, 8
+; GFX10-DL-NEXT:    s_lshr_b32 s6, s4, 12
+; GFX10-DL-NEXT:    s_lshr_b32 s7, s5, 4
+; GFX10-DL-NEXT:    s_lshr_b32 s8, s5, 8
+; GFX10-DL-NEXT:    s_lshr_b32 s9, s5, 12
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s6
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v15, 12, s0
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v9, 12, s8
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s9
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s4
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v19, 12, s7
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s5
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, v5, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, v4, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, v15, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, v7, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v8, v8, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v9, v9, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v10, v19, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v11, v11, v2
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 12, v5
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v4
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v8, 12, v8
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v9, 12, v9
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v19, 12, v10
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v11, 12, v11
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v15, 12, v6
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v7, 12, v7
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s4, 20
+; GFX10-DL-NEXT:    s_lshr_b32 s6, s4, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX10-DL-NEXT:    s_lshr_b32 s8, s5, 20
+; GFX10-DL-NEXT:    s_lshr_b32 s9, s5, 24
+; GFX10-DL-NEXT:    s_lshr_b32 s7, s5, 16
+; GFX10-DL-NEXT:    s_lshr_b32 s5, s5, 28
+; GFX10-DL-NEXT:    v_and_b32_e32 v23, v15, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v10, v19, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, v5, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v8, v8, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, v4, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v9, v9, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v22, v7, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v11, v11, v2
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, v5, v8
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s4
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v4, v9
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s6
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, v22, v11
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v15, 12, s0
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v20, 12, s8
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v23, v23, v10
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v21, 12, s1
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v16, 12, s5
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v17, 12, s9
+; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v19, 12, s7
+; GFX10-DL-NEXT:    v_and_b32_e32 v8, v12, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v9, v13, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v11, v15, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v12, v16, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v13, v17, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v15, v19, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v10, v21, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v14, v20, v2
+; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v23, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v7, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v12, 12, v12
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v13, 12, v13
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v8, 12, v8
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v9, 12, v9
+; GFX10-DL-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v19, 12, v10
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v15, 12, v15
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v11, 12, v11
+; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v23, 12, v14
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, v8, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, v9, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v13, v13, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v9, v11, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v12, v12, v2
+; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_e32 v10, v15, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v8, v19, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v11, v23, v2
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, v7, v13
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, v5, v12
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, v9, v10
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v4
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v8, v8, v11
+; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v7, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v6, v6, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v8, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v2, v5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v4, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v3, v9
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v3, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v3, v4
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                              <8 x i4> addrspace(1)* %src2,
                                              i8 addrspace(1)* nocapture %dst) {
 entry:

Modified: llvm/trunk/test/CodeGen/AMDGPU/idot8u.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/idot8u.ll?rev=363946&r1=363945&r2=363946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/idot8u.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/idot8u.ll Thu Jun 20 09:29:40 2019
@@ -3,6 +3,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 
 define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc32:
@@ -162,6 +164,23 @@ define amdgpu_kernel void @udot8_acc32(<
 ; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s2, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot8_acc32:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s2, s4, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
                                        i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -423,6 +442,47 @@ define amdgpu_kernel void @udot8_acc16(<
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot8_acc16:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s0, s2, 15
+; GFX10-DL-NEXT:    s_and_b32 s1, s4, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s4, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s9, s2, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s10, s4, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s11, s2, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s12, s4, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s13, s2, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s14, s4, 0x40014
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40018
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s7, s8, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s9, s10, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s11, s12, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s13, s14, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                        <8 x i4> addrspace(1)* %src2,
                                        i16 addrspace(1)* nocapture %dst) {
 entry:
@@ -684,6 +744,47 @@ define amdgpu_kernel void @udot8_acc8(<8
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot8_acc8:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s0, s2, 15
+; GFX10-DL-NEXT:    s_and_b32 s1, s4, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s4, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s9, s2, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s10, s4, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s11, s2, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s12, s4, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s13, s2, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s14, s4, 0x40014
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40018
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s7, s8, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s9, s10, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s11, s12, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s13, s14, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                       <8 x i4> addrspace(1)* %src2,
                                       i8 addrspace(1)* nocapture %dst) {
 entry:
@@ -955,6 +1056,50 @@ define amdgpu_kernel void @udot8_acc4(<8
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot8_acc4:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s0, s2, 15
+; GFX10-DL-NEXT:    s_and_b32 s1, s4, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s9, s4, 0x40008
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s4, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40010
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s8, s0
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40014
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s7, s9, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40018
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                       <8 x i4> addrspace(1)* %src2,
                                       i4 addrspace(1)* nocapture %dst) {
 entry:
@@ -1210,6 +1355,50 @@ define amdgpu_kernel void @udot8_Commuta
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot8_CommutationInsideMAD:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s0, s2, 15
+; GFX10-DL-NEXT:    s_and_b32 s1, s4, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s4, 0x40008
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s8, s1
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40014
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s7, s0, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40010
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40018
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                       <8 x i4> addrspace(1)* %src2,
                                                       i4 addrspace(1)* nocapture %dst) {
 entry:
@@ -1458,6 +1647,48 @@ define amdgpu_kernel void @udot8_multius
 ; GFX9-DL-NEXT:    v_add_u32_e32 v2, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot8_multiuses_mul1:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s0, s2, 15
+; GFX10-DL-NEXT:    s_and_b32 s1, s4, 15
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s4, 0x40008
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s9, s2, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s10, s4, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s11, s2, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s12, s4, 0x40010
+; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s5, s6, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s13, s2, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s14, s4, 0x40018
+; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s7, s8, v3
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s9, s10, v3
+; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s11, s12, v3
+; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s5, s6, v3
+; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s13, s14, v3
+; GFX10-DL-NEXT:    v_mad_u32_u24 v3, s2, s4, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                                 <8 x i4> addrspace(1)* %src2,
                                                 i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -1686,6 +1917,23 @@ define amdgpu_kernel void @udot8_acc32_v
 ; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s2, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot8_acc32_vecMul:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s2, s4, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
                                               i32 addrspace(1)* nocapture %dst) {
 entry:
@@ -1946,6 +2194,58 @@ define amdgpu_kernel void @udot8_acc16_v
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot8_acc16_vecMul:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s0, s2, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s2, 0x40004
+; GFX10-DL-NEXT:    s_and_b32 s5, s4, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40008
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x4000c
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s2, 0x40010
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, s0, s5
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s1, s6
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s2, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s4, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40014
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, s7, s0
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s8, s1
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40018
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40018
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v3, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, s1, s5
+; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s6, s4
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, s0, s1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v4
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                               <8 x i4> addrspace(1)* %src2,
                                               i16 addrspace(1)* nocapture %dst) {
 entry:
@@ -2246,6 +2546,67 @@ define amdgpu_kernel void @udot8_acc8_ve
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot8_acc8_vecMul:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v3, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s4, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s2, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x4000c
+; GFX10-DL-NEXT:    s_and_b32 s7, s2, 15
+; GFX10-DL-NEXT:    s_and_b32 s9, s4, 15
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, s0, s5
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s2, 0x40008
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s1, s6
+; GFX10-DL-NEXT:    s_bfe_u32 s10, s4, 0x40008
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, s7, s9
+; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40014
+; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, s8, s10
+; GFX10-DL-NEXT:    s_lshr_b32 s1, s2, 28
+; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT:    s_lshr_b32 s6, s4, 28
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s4, 0x40014
+; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s4, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s2, s2, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s4, s4, 0x40018
+; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v5, s0, s5
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v11, s1, s6
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v8, s7, s8
+; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v9, s2, s4
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v4
+; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v2, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v9, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v6, v4, v3
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v6, v7
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v3, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v3, v3, v4
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                              <8 x i4> addrspace(1)* %src2,
                                              i8 addrspace(1)* nocapture %dst) {
 entry:
@@ -2481,6 +2842,50 @@ define amdgpu_kernel void @udot8_acc4_ve
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot8_acc4_vecMul:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_and_b32 s0, s2, 15
+; GFX10-DL-NEXT:    s_and_b32 s1, s4, 15
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40004
+; GFX10-DL-NEXT:    s_bfe_u32 s7, s2, 0x40008
+; GFX10-DL-NEXT:    s_bfe_u32 s8, s2, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s9, s4, 0x40008
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s4, 0x4000c
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40010
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    v_mul_u32_u24_e64 v3, s8, s0
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40010
+; GFX10-DL-NEXT:    s_bfe_u32 s5, s2, 0x40014
+; GFX10-DL-NEXT:    s_bfe_u32 s6, s4, 0x40014
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s7, s9, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, v2, v3
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    s_bfe_u32 s0, s2, 0x40018
+; GFX10-DL-NEXT:    s_bfe_u32 s1, s4, 0x40018
+; GFX10-DL-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX10-DL-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s5, s6, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s0, s1, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX10-DL-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                              <8 x i4> addrspace(1)* %src2,
                                              i4 addrspace(1)* nocapture %dst) {
 entry:
@@ -2669,6 +3074,23 @@ define amdgpu_kernel void @udot8_variant
 ; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s3, v2, v3
 ; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: udot8_variant1:
+; GFX10-DL:       ; %bb.0: ; %entry
+; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT:    v_dot8_u32_u4 v2, s3, s2, v2
+; GFX10-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT:    s_endpgm
                                           i32 addrspace(1)* %v2addr,
                                           i32 addrspace(1)* %dst) {
 entry:

Added: llvm/trunk/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll?rev=363946&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll Thu Jun 20 09:29:40 2019
@@ -0,0 +1,41 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+
+; GCN-LABEL: _amdgpu_hs_main:
+; Empty amdgpu_hs entry point; only its label is matched. Attribute group #0 requests wavefrontsize32 and a max work-group size of 128.
+define amdgpu_hs void @_amdgpu_hs_main() #0 {
+.entry:
+  ret void
+}
+
+; GCN-LABEL: _amdgpu_ps_main:
+; GCN: s_and_saveexec_b64
+; amdgpu_ps entry point in attribute group #1 (wavefrontsize64); the check above expects a 64-bit exec-mask save in its output.
+define amdgpu_ps void @_amdgpu_ps_main(i32 %arg) local_unnamed_addr #1 {
+.entry:
+  %tmp = tail call float @llvm.amdgcn.interp.p2(float undef, float undef, i32 1, i32 0, i32 %arg) #2
+  %tmp1 = tail call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float undef, float %tmp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
+  %tmp2 = fcmp olt float %tmp1, 5.000000e-01 ; ordered less-than compare of the sampled value against 0.5
+  br i1 %tmp2, label %bb, label %l ; presumably this branch is what makes the backend emit the exec-mask save - confirm
+
+bb:                                               ; preds = %.entry
+  unreachable
+
+l: ; preds = %.entry
+  ret void
+}
+
+; GCN-LABEL: _amdgpu_gs_main:
+; Empty amdgpu_gs entry point; only its label is matched. Attribute group #4 requests wavefrontsize32.
+define amdgpu_gs void @_amdgpu_gs_main() #4 {
+.entry:
+  ret void
+}
+
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
+declare float @llvm.amdgcn.image.sample.2d.f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
+
+attributes #0 = { "amdgpu-max-work-group-size"="128" "target-features"=",+wavefrontsize32" }
+attributes #1 = { "target-features"=",+wavefrontsize64" }
+attributes #2 = { nounwind readnone speculatable }
+attributes #3 = { nounwind readonly }
+attributes #4 = { "target-features"=",+wavefrontsize32" }

Added: llvm/trunk/test/CodeGen/AMDGPU/mixed_wave32_wave64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/mixed_wave32_wave64.ll?rev=363946&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/mixed_wave32_wave64.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/mixed_wave32_wave64.ll Thu Jun 20 09:29:40 2019
@@ -0,0 +1,107 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+;
+; Check that PS is wave64
+; GFX10-LABEL: _amdgpu_ps_main:
+; GFX10: s_and_saveexec_b64
+;
+; Check that VS is wave32
+; GFX10-LABEL: _amdgpu_vs_main:
+; GFX10: s_and_saveexec_b32
+;
+; Check that GS is wave32
+; GFX10-LABEL: _amdgpu_gs_main:
+; GFX10: s_and_saveexec_b32
+;
+; Check that HS is wave32
+; GFX10-LABEL: _amdgpu_hs_main:
+; GFX10: s_and_saveexec_b32
+;
+; Check that CS is wave32
+; GFX10-LABEL: _amdgpu_cs_main:
+; GFX10: s_and_saveexec_b32
+;
+; Check that:
+; PS_W32_EN (bit 15) of SPI_PS_IN_CONTROL (0xa1b6) is 0;
+; VS_W32_EN (bit 23) of VGT_SHADER_STAGES_EN (0xa2d5) is 1;
+; GS_W32_EN (bit 22) of VGT_SHADER_STAGES_EN (0xa2d5) is 1;
+; HS_W32_EN (bit 21) of VGT_SHADER_STAGES_EN (0xa2d5) is 1;
+; CS_W32_EN (bit 15) of COMPUTE_DISPATCH_INITIATOR (0x2e00) is 1.
+;
+; GFX10: .amd_amdgpu_pal_metadata{{.*}},0x2e00,0x8000,{{.*}}0xa1b6,0x1,{{.*}},0xa2d5,0xe00000,
+; amdgpu_ps entry point in attribute group #0 (wavefrontsize64); the checks at the top of the file expect s_and_saveexec_b64 under its label.
+define dllexport amdgpu_ps void @_amdgpu_ps_main(float %arg10) #0 {
+.entry:
+  %tmp100 = fcmp ogt float %arg10, 0.25 ; ordered greater-than compare against 0.25
+  br i1 %tmp100, label %if, label %endif ; presumably this branch is what produces the exec-mask save the checks look for - confirm
+if:
+  %tmp101 = fadd float %arg10, 0.125
+  br label %endif
+endif:
+  %tmp102 = phi float [ %arg10, %.entry ], [ %tmp101, %if ] ; %arg10 unchanged, or %arg10 + 0.125 when the if block ran
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp102, float %tmp102, float %tmp102, float %tmp102, i1 true, i1 true) ; same value passed in all four float operands
+  ret void
+}
+; amdgpu_vs entry point in attribute group #2 (wavefrontsize32); the checks at the top of the file expect s_and_saveexec_b32 under its label. Only %arg10 is used by the body.
+define dllexport amdgpu_vs void @_amdgpu_vs_main(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, float %arg10) local_unnamed_addr #2 {
+.entry:
+  %tmp100 = fcmp ogt float %arg10, 0.25 ; ordered greater-than compare against 0.25
+  br i1 %tmp100, label %if, label %endif ; presumably this branch is what produces the exec-mask save the checks look for - confirm
+if:
+  %tmp101 = fadd float %arg10, 0.125
+  br label %endif
+endif:
+  %tmp102 = phi float [ %arg10, %.entry ], [ %tmp101, %if ] ; %arg10 unchanged, or %arg10 + 0.125 when the if block ran
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp102, float %tmp102, float %tmp102, float %tmp102, i1 false, i1 false) ; same value passed in all four float operands
+  ret void
+}
+; amdgpu_gs entry point in attribute group #2 (wavefrontsize32); the checks at the top of the file expect s_and_saveexec_b32 under its label. Only %arg10 is used by the body.
+define dllexport amdgpu_gs void @_amdgpu_gs_main(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, float %arg10) local_unnamed_addr #2 {
+.entry:
+  %tmp100 = fcmp ogt float %arg10, 0.25 ; ordered greater-than compare against 0.25
+  br i1 %tmp100, label %if, label %endif ; presumably this branch is what produces the exec-mask save the checks look for - confirm
+if:
+  %tmp101 = fadd float %arg10, 0.125
+  br label %endif
+endif:
+  %tmp102 = phi float [ %arg10, %.entry ], [ %tmp101, %if ] ; %arg10 unchanged, or %arg10 + 0.125 when the if block ran
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp102, float %tmp102, float %tmp102, float %tmp102, i1 false, i1 false) ; same value passed in all four float operands
+  ret void
+}
+; amdgpu_hs entry point in attribute group #2 (wavefrontsize32); the checks at the top of the file expect s_and_saveexec_b32 under its label. Only %arg10 is used by the body.
+define dllexport amdgpu_hs void @_amdgpu_hs_main(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, float %arg10) local_unnamed_addr #2 {
+.entry:
+  %tmp100 = fcmp ogt float %arg10, 0.25 ; ordered greater-than compare against 0.25
+  br i1 %tmp100, label %if, label %endif ; presumably this branch is what produces the exec-mask save the checks look for - confirm
+if:
+  %tmp101 = fadd float %arg10, 0.125
+  br label %endif
+endif:
+  %tmp102 = phi float [ %arg10, %.entry ], [ %tmp101, %if ] ; %arg10 unchanged, or %arg10 + 0.125 when the if block ran
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp102, float %tmp102, float %tmp102, float %tmp102, i1 false, i1 false) ; same value passed in all four float operands
+  ret void
+}
+; amdgpu_cs entry point in attribute group #2 (wavefrontsize32); the checks at the top of the file expect s_and_saveexec_b32 under its label. Only %arg10 is used by the body.
+define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, float %arg10) local_unnamed_addr #2 {
+.entry:
+  %tmp100 = fcmp ogt float %arg10, 0.25 ; ordered greater-than compare against 0.25
+  br i1 %tmp100, label %if, label %endif ; presumably this branch is what produces the exec-mask save the checks look for - confirm
+if:
+  %tmp101 = fadd float %arg10, 0.125
+  br label %endif
+endif:
+  %tmp102 = phi float [ %arg10, %.entry ], [ %tmp101, %if ] ; %arg10 unchanged, or %arg10 + 0.125 when the if block ran
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp102, float %tmp102, float %tmp102, float %tmp102, i1 false, i1 false) ; same value passed in all four float operands
+  ret void
+}
+
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #2
+
+attributes #0 = { nounwind "InitialPSInputAddr"="2" "target-features"="+wavefrontsize64" }
+attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { nounwind "target-features"="+wavefrontsize32" }
+attributes #3 = { nounwind readonly }
+
+!amdgpu.pal.metadata = !{!8}
+
+!6 = !{}
+!8 = !{i32 268435482, i32 1, i32 268435488, i32 -1, i32 268435480, i32 -322237066, i32 268435481, i32 717283096, i32 268435538, i32 4096, i32 268435539, i32 8192, i32 11338, i32 53215232, i32 11339, i32 10, i32 41411, i32 4, i32 41393, i32 0, i32 41479, i32 0, i32 41476, i32 17301504, i32 41478, i32 1087, i32 41721, i32 45, i32 41633, i32 0, i32 41702, i32 0, i32 41653, i32 0, i32 41657, i32 0, i32 41661, i32 0, i32 41665, i32 0, i32 41645, i32 0, i32 41750, i32 14, i32 268435528, i32 0, i32 268435493, i32 0, i32 268435500, i32 0, i32 268435536, i32 0, i32 11274, i32 2883584, i32 11275, i32 4, i32 41412, i32 0, i32 41413, i32 4, i32 41400, i32 16777216, i32 41398, i32 1, i32 41395, i32 0, i32 41396, i32 0, i32 41397, i32 0, i32 41619, i32 100794764, i32 41475, i32 16, i32 41103, i32 15, i32 268435485, i32 0, i32 268435529, i32 0, i32 268435494, i32 0, i32 268435501, i32 0, i32 41685, i32 0, i32 268435460, i32 -431267536, i32 268435461, i32 -366377628, i32 268435476, i32 352863062, i32 268435477, i32 1678737839, i32 268435532, i32 1, i32 41642, i32 127, i32 11343, i32 268435459, i32 11344, i32 268435460, i32 11340, i32 268435456, i32 11342, i32 0, i32 41361, i32 0, i32 11276, i32 268435456}

Modified: llvm/trunk/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll?rev=363946&r1=363945&r2=363946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll Thu Jun 20 09:29:40 2019
@@ -1,68 +1,130 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s
-; RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=CHECK-O0
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W32
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64
+; RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0
 
 ; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
 
-; CHECK-LABEL: mubuf_vgpr
-; CHECK: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
-; CHECK: [[LOOPBB:BB[0-9]+_[0-9]+]]:
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
-; CHECK: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
-; CHECK: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
-; CHECK: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
-; CHECK: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; CHECK: s_waitcnt vmcnt(0)
-; CHECK: buffer_load_format_x [[RES:v[0-9]+]], v4, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
-; CHECK: s_xor_b64 exec, exec, [[CMP]]
-; CHECK: s_cbranch_execnz [[LOOPBB]]
-; CHECK: s_mov_b64 exec, [[SAVEEXEC]]
-; CHECK: v_mov_b32_e32 v0, [[RES]]
+; W64-LABEL: mubuf_vgpr
+; W64: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
+; W64: [[LOOPBB:BB[0-9]+_[0-9]+]]:
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
+; W64: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
+; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
+; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
+; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
+; W64: s_waitcnt vmcnt(0)
+; W64: buffer_load_format_x [[RES:v[0-9]+]], v4, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
+; W64: s_xor_b64 exec, exec, [[CMP]]
+; W64: s_cbranch_execnz [[LOOPBB]]
+; W64: s_mov_b64 exec, [[SAVEEXEC]]
+; W64: v_mov_b32_e32 v0, [[RES]]
+
+; W32-LABEL: mubuf_vgpr
+; W32: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo
+; W32: [[LOOPBB:BB[0-9]+_[0-9]+]]:
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
+; W32: v_cmp_eq_u64_e32 vcc_lo, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
+; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
+; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
+; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
+; W32: s_waitcnt vmcnt(0)
+; W32: buffer_load_format_x [[RES:v[0-9]+]], v4, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
+; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
+; W32: s_cbranch_execnz [[LOOPBB]]
+; W32: s_mov_b32 exec_lo, [[SAVEEXEC]]
+; W32: v_mov_b32_e32 v0, [[RES]]
+
 define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
   %call = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %i, i32 %c, i32 0, i1 zeroext false, i1 zeroext false) #1
   ret float %call
 }
 
-; CHECK-LABEL: mubuf_vgpr_adjacent_in_block
 
-; CHECK: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
-; CHECK: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
-; CHECK: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
-; CHECK: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
-; CHECK: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
-; CHECK: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; CHECK: s_waitcnt vmcnt(0)
-; CHECK: buffer_load_format_x [[RES0:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
-; CHECK: s_xor_b64 exec, exec, [[CMP]]
-; CHECK: s_cbranch_execnz [[LOOPBB0]]
+; W64-LABEL: mubuf_vgpr_adjacent_in_block
+
+; W64: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
+; W64: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
+; W64: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
+; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
+; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
+; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
+; W64: s_waitcnt vmcnt(0)
+; W64: buffer_load_format_x [[RES0:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
+; W64: s_xor_b64 exec, exec, [[CMP]]
+; W64: s_cbranch_execnz [[LOOPBB0]]
 
-; CHECK: s_mov_b64 exec, [[SAVEEXEC]]
+; W64: s_mov_b64 exec, [[SAVEEXEC]]
 ; FIXME: redundant s_mov
-; CHECK: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
+; W64: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
 
-; CHECK: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v4
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v5
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v6
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v7
-; CHECK: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[4:5]
-; CHECK: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
-; CHECK: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
-; CHECK: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; CHECK: s_waitcnt vmcnt(0)
-; CHECK: buffer_load_format_x [[RES1:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
-; CHECK: s_xor_b64 exec, exec, [[CMP]]
-; CHECK: s_cbranch_execnz [[LOOPBB1]]
-
-; CHECK: s_mov_b64 exec, [[SAVEEXEC]]
-; CHECK-DAG: global_store_dword v[9:10], [[RES0]], off
-; CHECK-DAG: global_store_dword v[11:12], [[RES1]], off
+; W64: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v4
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v5
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v6
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v7
+; W64: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[4:5]
+; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
+; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
+; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
+; W64: s_waitcnt vmcnt(0)
+; W64: buffer_load_format_x [[RES1:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
+; W64: s_xor_b64 exec, exec, [[CMP]]
+; W64: s_cbranch_execnz [[LOOPBB1]]
+
+; W64: s_mov_b64 exec, [[SAVEEXEC]]
+; W64-DAG: global_store_dword v[9:10], [[RES0]], off
+; W64-DAG: global_store_dword v[11:12], [[RES1]], off
+
+
+; W32-LABEL: mubuf_vgpr_adjacent_in_block
+
+; W32: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo
+; W32: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
+; W32: v_cmp_eq_u64_e32 vcc_lo, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
+; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
+; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
+; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
+; W32: s_waitcnt vmcnt(0)
+; W32: buffer_load_format_x [[RES0:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
+; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
+; W32: s_cbranch_execnz [[LOOPBB0]]
+
+; W32: s_mov_b32 exec_lo, [[SAVEEXEC]]
+; FIXME: redundant s_mov
+; W32: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo
+
+; W32: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v4
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v5
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v6
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v7
+; W32: v_cmp_eq_u64_e32 vcc_lo, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[4:5]
+; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
+; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
+; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
+; W32: s_waitcnt vmcnt(0)
+; W32: buffer_load_format_x [[RES1:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
+; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
+; W32: s_cbranch_execnz [[LOOPBB1]]
+
+; W32: s_mov_b32 exec_lo, [[SAVEEXEC]]
+; W32-DAG: global_store_dword v[9:10], [[RES0]], off
+; W32-DAG: global_store_dword v[11:12], [[RES1]], off
 
 define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, float addrspace(1)* %out0, float addrspace(1)* %out1) #0 {
 entry:
@@ -73,141 +135,184 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: mubuf_vgpr_outside_entry
 
-; CHECK-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
-; CHECK-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
+; W64-LABEL: mubuf_vgpr_outside_entry
+
+; W64-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
+; W64-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
+
+; W64: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
+; W64: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
+; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
+; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
+; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
+; W64: s_waitcnt vmcnt(0)
+; W64: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
+; W64: s_xor_b64 exec, exec, [[CMP]]
+; W64: s_cbranch_execnz [[LOOPBB0]]
+
+; W64: s_mov_b64 exec, [[SAVEEXEC]]
+; W64: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]]
+
+; W64: BB{{[0-9]+_[0-9]+}}:
+; W64-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
+; W64-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
+
+; W64: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v4
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v5
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v6
+; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v7
+; W64: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[4:5]
+; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
+; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
+; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
+; W64: s_waitcnt vmcnt(0)
+; W64: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
+; W64: s_xor_b64 exec, exec, [[CMP]]
+; W64: s_cbranch_execnz [[LOOPBB1]]
+
+; W64: s_mov_b64 exec, [[SAVEEXEC]]
+
+; W64: [[TERMBB]]:
+; W64: global_store_dword v[11:12], [[RES]], off
+
+
+; W32-LABEL: mubuf_vgpr_outside_entry
+
+; W32-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
+; W32-DAG: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo
+
+; W32: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
+; W32: v_cmp_eq_u64_e32 vcc_lo, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
+; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
+; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
+; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
+; W32: s_waitcnt vmcnt(0)
+; W32: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
+; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
+; W32: s_cbranch_execnz [[LOOPBB0]]
+
+; W32: s_mov_b32 exec_lo, [[SAVEEXEC]]
+; W32: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]]
+
+; W32: BB{{[0-9]+_[0-9]+}}:
+; W32-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
+; W32-DAG: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo
+
+; W32: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v4
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v5
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v6
+; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v7
+; W32: v_cmp_eq_u64_e32 vcc_lo, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[4:5]
+; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
+; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
+; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
+; W32: s_waitcnt vmcnt(0)
+; W32: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
+; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
+; W32: s_cbranch_execnz [[LOOPBB1]]
 
-; CHECK: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
-; CHECK: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
-; CHECK: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
-; CHECK: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
-; CHECK: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; CHECK: s_waitcnt vmcnt(0)
-; CHECK: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
-; CHECK: s_xor_b64 exec, exec, [[CMP]]
-; CHECK: s_cbranch_execnz [[LOOPBB0]]
-
-; CHECK: s_mov_b64 exec, [[SAVEEXEC]]
-; CHECK: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]]
-
-; CHECK: BB{{[0-9]+_[0-9]+}}:
-; CHECK-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
-; CHECK-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
-
-; CHECK: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v4
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v5
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v6
-; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v7
-; CHECK: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[4:5]
-; CHECK: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
-; CHECK: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
-; CHECK: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; CHECK: s_waitcnt vmcnt(0)
-; CHECK: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
-; CHECK: s_xor_b64 exec, exec, [[CMP]]
-; CHECK: s_cbranch_execnz [[LOOPBB1]]
+; W32: s_mov_b32 exec_lo, [[SAVEEXEC]]
 
-; CHECK: s_mov_b64 exec, [[SAVEEXEC]]
+; W32: [[TERMBB]]:
+; W32: global_store_dword v[11:12], [[RES]], off
 
-; CHECK: [[TERMBB]]:
-; CHECK: global_store_dword v[11:12], [[RES]], off
 
 ; Confirm spills do not occur between the XOR and branch that terminate the
 ; waterfall loop BBs.
 
-; CHECK-O0-LABEL: mubuf_vgpr_outside_entry
+; W64-O0-LABEL: mubuf_vgpr_outside_entry
 
-; CHECK-O0-DAG: s_mov_b32 [[IDX_S:s[0-9]+]], s4
-; CHECK-O0-DAG: v_mov_b32_e32 [[IDX_V:v[0-9]+]], [[IDX_S]]
-; CHECK-O0-DAG: s_mov_b64 s{{\[}}[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]{{\]}}, exec
-; CHECK-O0-DAG: buffer_store_dword [[IDX_V]], off, s[0:3], s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill
-; CHECK-O0-DAG: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]]
-; CHECK-O0-DAG: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]]
-
-; CHECK-O0: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
-; CHECK-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; CHECK-O0: s_waitcnt vmcnt(0)
-; CHECK-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; CHECK-O0: s_waitcnt vmcnt(0)
-; CHECK-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; CHECK-O0: s_waitcnt vmcnt(0)
-; CHECK-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; CHECK-O0: s_waitcnt vmcnt(0)
-; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP0:[0-9]+]], v[[VRSRC0]]
-; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]]
-; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]]
-; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]]
-; CHECK-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[SRSRCTMP0]]
-; CHECK-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]]
-; CHECK-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]]
-; CHECK-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]]
-; CHECK-O0: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}}
-; CHECK-O0: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
-; CHECK-O0: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
-; CHECK-O0: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; CHECK-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload
-; CHECK-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, {{.*}} idxen
-; CHECK-O0: s_waitcnt vmcnt(0)
-; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill
-; CHECK-O0: s_xor_b64 exec, exec, [[CMP]]
-; CHECK-O0-NEXT: s_cbranch_execnz [[LOOPBB0]]
-
-; CHECK-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]]
-; CHECK-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]]
-; CHECK-O0: s_mov_b64 exec, s{{\[}}[[SAVEEXEC0]]:[[SAVEEXEC1]]{{\]}}
-; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload
-; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF:[0-9]+]] ; 4-byte Folded Spill
-; CHECK-O0: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]]
-
-; CHECK-O0: BB{{[0-9]+_[0-9]+}}:
-; CHECK-O0-DAG: s_mov_b64 s{{\[}}[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]{{\]}}, exec
-; CHECK-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill
-; CHECK-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]]
-; CHECK-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]]
-
-; CHECK-O0: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
-; CHECK-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; CHECK-O0: s_waitcnt vmcnt(0)
-; CHECK-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; CHECK-O0: s_waitcnt vmcnt(0)
-; CHECK-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; CHECK-O0: s_waitcnt vmcnt(0)
-; CHECK-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; CHECK-O0: s_waitcnt vmcnt(0)
-; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP0:[0-9]+]], v[[VRSRC0]]
-; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]]
-; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]]
-; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]]
-; CHECK-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[SRSRCTMP0]]
-; CHECK-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]]
-; CHECK-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]]
-; CHECK-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]]
-; CHECK-O0: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}}
-; CHECK-O0: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
-; CHECK-O0: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
-; CHECK-O0: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; CHECK-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload
-; CHECK-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, {{.*}} idxen
-; CHECK-O0: s_waitcnt vmcnt(0)
-; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill
-; CHECK-O0: s_xor_b64 exec, exec, [[CMP]]
-; CHECK-O0-NEXT: s_cbranch_execnz [[LOOPBB1]]
-
-; CHECK-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]]
-; CHECK-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]]
-; CHECK-O0: s_mov_b64 exec, s{{\[}}[[SAVEEXEC0]]:[[SAVEEXEC1]]{{\]}}
-; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload
-; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF]] ; 4-byte Folded Spill
-
-; CHECK-O0: [[TERMBB]]:
-; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF]] ; 4-byte Folded Reload
-; CHECK-O0: global_store_dword v[{{[0-9]+:[0-9]+}}], [[RES]], off
+; W64-O0-DAG: s_mov_b32 [[IDX_S:s[0-9]+]], s4
+; W64-O0-DAG: v_mov_b32_e32 [[IDX_V:v[0-9]+]], [[IDX_S]]
+; W64-O0-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
+; W64-O0-DAG: buffer_store_dword [[IDX_V]], off, s[0:3], s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill
+
+; W64-O0: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
+; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload
+; W64-O0: s_waitcnt vmcnt(0)
+; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], {{.*}} ; 4-byte Folded Reload
+; W64-O0: s_waitcnt vmcnt(0)
+; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], {{.*}} ; 4-byte Folded Reload
+; W64-O0: s_waitcnt vmcnt(0)
+; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], {{.*}} ; 4-byte Folded Reload
+; W64-O0: s_waitcnt vmcnt(0)
+; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP0:[0-9]+]], v[[VRSRC0]]
+; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]]
+; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]]
+; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]]
+; W64-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[SRSRCTMP0]]
+; W64-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]]
+; W64-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]]
+; W64-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]]
+; W64-O0: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}}
+; W64-O0: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
+; W64-O0: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
+; W64-O0: s_and_saveexec_b64 [[CMP]], [[CMP]]
+; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload
+; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, {{.*}} idxen
+; W64-O0: s_waitcnt vmcnt(0)
+; W64-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill
+; W64-O0: s_xor_b64 exec, exec, [[CMP]]
+; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB0]]
+; W64-O0: s_mov_b64 exec, s{{\[[0-9]+:[0-9]+\]}}
+; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload
+; W64-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF:[0-9]+]] ; 4-byte Folded Spill
+; W64-O0: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]]
+
+; W64-O0: BB{{[0-9]+_[0-9]+}}:
+; W64-O0-DAG: s_mov_b64 s{{\[}}[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]{{\]}}, exec
+; W64-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill
+; W64-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]]
+; W64-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]]
+
+; W64-O0: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
+; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload
+; W64-O0: s_waitcnt vmcnt(0)
+; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], {{.*}} ; 4-byte Folded Reload
+; W64-O0: s_waitcnt vmcnt(0)
+; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], {{.*}} ; 4-byte Folded Reload
+; W64-O0: s_waitcnt vmcnt(0)
+; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], {{.*}} ; 4-byte Folded Reload
+; W64-O0: s_waitcnt vmcnt(0)
+; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP0:[0-9]+]], v[[VRSRC0]]
+; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]]
+; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]]
+; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]]
+; W64-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[SRSRCTMP0]]
+; W64-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]]
+; W64-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]]
+; W64-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]]
+; W64-O0: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}}
+; W64-O0: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
+; W64-O0: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
+; W64-O0: s_and_saveexec_b64 [[CMP]], [[CMP]]
+; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload
+; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, {{.*}} idxen
+; W64-O0: s_waitcnt vmcnt(0)
+; W64-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill
+; W64-O0: s_xor_b64 exec, exec, [[CMP]]
+; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB1]]
+
+; W64-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]]
+; W64-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]]
+; W64-O0: s_mov_b64 exec, s{{\[}}[[SAVEEXEC0]]:[[SAVEEXEC1]]{{\]}}
+; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload
+; W64-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF]] ; 4-byte Folded Spill
+
+; W64-O0: [[TERMBB]]:
+; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF]] ; 4-byte Folded Reload
+; W64-O0: global_store_dword v[{{[0-9]+:[0-9]+}}], [[RES]], off
 
 define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, float addrspace(1)* %in, float addrspace(1)* %out) #0 {
 entry:

Added: llvm/trunk/test/CodeGen/AMDGPU/optimize-negated-cond-exec-masking-wave32.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/optimize-negated-cond-exec-masking-wave32.mir?rev=363946&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/optimize-negated-cond-exec-masking-wave32.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/optimize-negated-cond-exec-masking-wave32.mir Thu Jun 20 09:29:40 2019
@@ -0,0 +1,361 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass=si-optimize-exec-masking-pre-ra -o - %s | FileCheck -check-prefix=GCN %s
+
+# GCN: name: negated_cond_vop2
+# GCN:      %0:sgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %0, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop2
+body:             |
+  bb.0:
+    %0:sgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
+    $vcc_lo = S_AND_B32 $exec_lo, killed $vcc_lo, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+# GCN: name: negated_cond_vop3
+# GCN:      %0:sgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %0, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3
+body:             |
+  bb.0:
+    %0:sgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:sgpr_32 = V_CMP_NE_U32_e64 %1, 1, implicit $exec
+    $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+# GCN: name: negated_cond_vop2_redef_vcc1
+# GCN:      %0:sgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+# GCN-NEXT: V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
+# GCN-NEXT: $vcc_lo = COPY $sgpr0
+# GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, $vcc_lo, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop2_redef_vcc1
+body:             |
+  bb.0:
+    %0:sgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
+    $vcc_lo = COPY $sgpr0
+    $vcc_lo = S_AND_B32 $exec_lo, killed $vcc_lo, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+# GCN: name: negated_cond_vop3_redef_cmp
+# GCN:      %0:sgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+# GCN-NEXT: %2:sgpr_32 = V_CMP_NE_U32_e64 %1, 1, implicit $exec
+# GCN-NEXT: %2:sgpr_32 = COPY $sgpr0
+# GCN-NEXT: $vcc_lo = S_AND_B32 %2, $exec_lo, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3_redef_cmp
+body:             |
+  bb.0:
+    %0:sgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %2:sgpr_32 = V_CMP_NE_U32_e64 %1, 1, implicit $exec
+    %2 = COPY $sgpr0
+    $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+# GCN: name: negated_cond_undef_vcc
+# GCN:      $vcc_lo = S_AND_B32 $exec_lo, undef $vcc_lo, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_undef_vcc
+body:             |
+  bb.0:
+    $vcc_lo = S_AND_B32 $exec_lo, undef $vcc_lo, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+# GCN: name: negated_cond_vop3_imp_vcc
+# GCN:      $vcc_lo = IMPLICIT_DEF
+# GCN-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, $vcc_lo, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3_imp_vcc
+body:             |
+  bb.0:
+    $vcc_lo = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $vcc_lo, implicit $exec
+    %2:sgpr_32 = V_CMP_NE_U32_e64 %1, 1, implicit $exec
+    $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+# GCN: name: negated_cond_vop2_imp_vcc
+# GCN:      $vcc_lo = IMPLICIT_DEF
+# GCN-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, $vcc_lo, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop2_imp_vcc
+body:             |
+  bb.0:
+    $vcc_lo = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $vcc_lo, implicit $exec ; the select condition is the physical register $vcc_lo
+    V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec ; _e32 compare: the result goes to $vcc through an implicit def rather than a virtual register
+    $vcc_lo = S_AND_B32 killed $vcc_lo, $exec_lo, implicit-def dead $scc ; the check lines expect the select, the compare and this AND to be replaced by one S_ANDN2_B32 $exec_lo, $vcc_lo
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+# GCN: name: negated_cond_vop3_redef_sel
+# GCN:      %0:sgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+# GCN-NEXT: %1:vgpr_32 = COPY $vgpr0
+# GCN-NEXT: %2:sgpr_32 = V_CMP_NE_U32_e64 %1, 1, implicit $exec
+# GCN-NEXT: $vcc_lo = S_AND_B32 %2, $exec_lo, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3_redef_sel
+body:             |
+  bb.0:
+    %0:sgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    %1:vgpr_32 = COPY $vgpr0 ; %1 is redefined between the select and the compare, so the compare below no longer reads the select result
+    %2:sgpr_32 = V_CMP_NE_U32_e64 %1, 1, implicit $exec
+    $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc ; the check lines expect the whole sequence to stay (S_AND_B32 is not turned into S_ANDN2_B32); only the "killed" flags are absent
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+# GCN: name: negated_cond_vop2_used_sel
+# GCN:      %0:sgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+# GCN-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %0, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop2_used_sel
+body:             |
+  bb.0:
+    %0:sgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec ; the check lines expect this select to remain; %1 is read again in bb.2
+    V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec ; absent from the expected output
+    $vcc_lo = S_AND_B32 $exec_lo, killed $vcc_lo, implicit-def dead $scc ; expected to become S_ANDN2_B32 $exec_lo, %0
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    $vgpr0 = COPY %1 ; second use of the select result, outside the branch sequence
+    S_ENDPGM 0
+...
+
+# GCN: name: negated_cond_vop2_used_vcc
+# GCN:      %0:sgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+# GCN-NEXT: V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
+# GCN-NEXT: $sgpr0_sgpr1 = COPY $vcc
+# GCN-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %0, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop2_used_vcc
+body:             |
+  bb.0:
+    %0:sgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
+    $sgpr0_sgpr1 = COPY $vcc ; extra reader of the compare result; the check lines expect the select, the compare and this COPY to remain
+    $vcc_lo = S_AND_B32 $exec_lo, killed $vcc_lo, implicit-def dead $scc ; expected to become S_ANDN2_B32 $exec_lo, %0
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+# GCN: name: negated_cond_vop3_sel_wrong_subreg1
+# GCN:      %0:sgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1.sub1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+# GCN-NEXT: %2:sgpr_32 = V_CMP_NE_U32_e64 %1.sub1, 1, implicit $exec
+# GCN-NEXT: $vcc_lo = S_AND_B32 %2, $exec_lo, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3_sel_wrong_subreg1
+body:             |
+  bb.0:
+    %0:sgpr_32 = IMPLICIT_DEF
+    %1.sub1 = IMPLICIT_DEF
+    %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec ; the select writes sub0 of %1
+    %2:sgpr_32 = V_CMP_NE_U32_e64 %1.sub1, 1, implicit $exec ; the compare reads sub1, i.e. the IMPLICIT_DEF half, not the select result
+    $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc ; the check lines expect the whole sequence to stay (S_AND_B32 is not turned into S_ANDN2_B32)
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+# GCN: name: negated_cond_vop3_sel_wrong_subreg2
+# GCN:      %0:sgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+# GCN-NEXT: %1.sub1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %2:sgpr_32 = V_CMP_NE_U32_e64 %1.sub1, 1, implicit $exec
+# GCN-NEXT: $vcc_lo = S_AND_B32 %2, $exec_lo, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3_sel_wrong_subreg2
+body:             |
+  bb.0:
+    %0:sgpr_32 = IMPLICIT_DEF
+    %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec ; the select writes sub0 of %1
+    %1.sub1 = IMPLICIT_DEF ; same shape as negated_cond_vop3_sel_wrong_subreg1, with the two defs of %1 in the opposite order
+    %2:sgpr_32 = V_CMP_NE_U32_e64 %1.sub1, 1, implicit $exec ; the compare reads sub1, not the select result in sub0
+    $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc ; the check lines expect the whole sequence to stay (S_AND_B32 is not turned into S_ANDN2_B32)
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+# GCN: name: negated_cond_vop3_sel_right_subreg1
+# GCN:      %0:sgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1.sub1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %0, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3_sel_right_subreg1
+body:             |
+  bb.0:
+    %0:sgpr_32 = IMPLICIT_DEF
+    %1.sub1 = IMPLICIT_DEF ; the unrelated half of %1; kept in the expected output
+    %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec ; the select writes sub0 of %1
+    %2:sgpr_32 = V_CMP_NE_U32_e64 %1.sub0, 1, implicit $exec ; the compare reads the same sub0 that the select wrote
+    $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc ; the check lines expect the select, the compare and this AND to be replaced by one S_ANDN2_B32 $exec_lo, %0
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+# GCN: name: negated_cond_vop3_sel_right_subreg2
+# GCN:      %0:sgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1.sub1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %0, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3_sel_right_subreg2
+body:             |
+  bb.0:
+    %0:sgpr_32 = IMPLICIT_DEF
+    %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec ; the select writes sub0 of %1
+    %1.sub1 = IMPLICIT_DEF ; same shape as negated_cond_vop3_sel_right_subreg1, with the two defs of %1 in the opposite order; kept in the expected output
+    %2:sgpr_32 = V_CMP_NE_U32_e64 %1.sub0, 1, implicit $exec ; the compare reads the same sub0 that the select wrote
+    $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc ; the check lines expect the select, the compare and this AND to be replaced by one S_ANDN2_B32 $exec_lo, %0
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+# GCN: name: negated_cond_vop3_sel_subreg_overlap
+# GCN:      %0:sgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1.sub2:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+# GCN-NEXT: %1.sub2_sub3:vreg_128 = IMPLICIT_DEF
+# GCN-NEXT: %2:sgpr_32 = V_CMP_NE_U32_e64 %1.sub2, 1, implicit $exec
+# GCN-NEXT: $vcc_lo = S_AND_B32 %2, $exec_lo, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+---
+name:            negated_cond_vop3_sel_subreg_overlap
+body:             |
+  bb.0:
+    %0:sgpr_32 = IMPLICIT_DEF
+    %1.sub2:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec ; the select writes sub2 of %1
+    %1.sub2_sub3 = IMPLICIT_DEF ; a later def of the wider sub2_sub3 subregister sits between the select and the compare
+    %2:sgpr_32 = V_CMP_NE_U32_e64 %1.sub2, 1, implicit $exec ; reads sub2 after the intervening sub2_sub3 def
+    $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc ; the check lines expect the whole sequence to stay (S_AND_B32 is not turned into S_ANDN2_B32)
+    S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_BRANCH %bb.0
+
+  bb.2:
+    S_ENDPGM 0
+...

Added: llvm/trunk/test/MC/AMDGPU/gfx10-vop2be-literal.s
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/MC/AMDGPU/gfx10-vop2be-literal.s?rev=363946&view=auto
==============================================================================
--- llvm/trunk/test/MC/AMDGPU/gfx10-vop2be-literal.s (added)
+++ llvm/trunk/test/MC/AMDGPU/gfx10-vop2be-literal.s Thu Jun 20 09:29:40 2019
@@ -0,0 +1,7 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -show-encoding < %s | FileCheck -check-prefix=GFX10 %s
+
+v_add_co_ci_u32_e32 v3, vcc_lo, 12345, v3, vcc_lo // decimal literal 12345 as a source operand; the expected line below prints it as 0x3039 and its encoding ends with the bytes 0x39,0x30,0x00,0x00
+// GFX10: v_add_co_ci_u32_e32 v3, vcc_lo, 0x3039, v3, vcc_lo ; encoding: [0xff,0x06,0x06,0x50,0x39,0x30,0x00,0x00]
+
+v_cndmask_b32 v0, 12345, v1, vcc_lo // written without an encoding suffix; the expected line below shows the _e32 form with the same trailing literal bytes
+// GFX10: v_cndmask_b32_e32 v0, 0x3039, v1, vcc_lo ; encoding: [0xff,0x02,0x00,0x02,0x39,0x30,0x00,0x00]




More information about the llvm-commits mailing list