[llvm] r358879 - [AMDGPU] Regenerate uitofp i8 to float conversion tests.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 22 03:19:09 PDT 2019
Author: rksimon
Date: Mon Apr 22 03:19:09 2019
New Revision: 358879
URL: http://llvm.org/viewvc/llvm-project?rev=358879&view=rev
Log:
[AMDGPU] Regenerate uitofp i8 to float conversion tests.
Prep work for D60462
Modified:
llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
Modified: llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll?rev=358879&r1=358878&r2=358879&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll Mon Apr 22 03:19:09 2019
@@ -1,16 +1,42 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -check-prefixes=GCN,SI
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -check-prefixes=GCN,VI
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
-; GCN-LABEL: {{^}}load_i8_to_f32:
-; GCN: {{buffer|flat}}_load_ubyte [[LOADREG:v[0-9]+]],
-; GCN-NOT: bfe
-; GCN-NOT: lshr
-; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
-; GCN: buffer_store_dword [[CONV]],
define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_i8_to_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: load_i8_to_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
%load = load i8, i8 addrspace(1)* %gep, align 1
@@ -19,12 +45,42 @@ define amdgpu_kernel void @load_i8_to_f3
ret void
}
-; GCN-LABEL: {{^}}load_v2i8_to_v2f32:
-; GCN: {{buffer|flat}}_load_ushort [[LD:v[0-9]+]]
-; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
-; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v2i8_to_v2f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: load_v2i8_to_v2f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ushort v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
%load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
@@ -33,15 +89,45 @@ define amdgpu_kernel void @load_v2i8_to_
ret void
}
-; GCN-LABEL: {{^}}load_v3i8_to_v3f32:
-; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
-; GCN-NOT: v_cvt_f32_ubyte3_e32
-; GCN-DAG: v_cvt_f32_ubyte2_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
-; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[MDRESULT:[0-9]+]], [[VAL]]
-; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
-; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[MDRESULT]]{{\]}},
-; VI: buffer_store_dwordx3 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v3i8_to_v3f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v2
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
+; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: load_v3i8_to_v3f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
%load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
@@ -50,16 +136,46 @@ define amdgpu_kernel void @load_v3i8_to_
ret void
}
-; GCN-LABEL: {{^}}load_v4i8_to_v4f32:
-; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]]
-; GCN-NOT: bfe
-; GCN-NOT: lshr
-; GCN-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
-; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]]
-; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
-; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
-; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v4i8_to_v4f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
+; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
+; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: load_v4i8_to_v4f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
+; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
%load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
@@ -72,20 +188,71 @@ define amdgpu_kernel void @load_v4i8_to_
; position in the word for the component.
; FIXME: Packing bytes
-; GCN-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
-; GCN: {{buffer|flat}}_load_ubyte [[LOADREG3:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ubyte [[LOADREG2:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ubyte [[LOADREG1:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ubyte [[LOADREG0:v[0-9]+]]
-; GCN-DAG: v_lshlrev_b32
-; GCN-DAG: v_or_b32
-; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
-; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
-; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
-; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
-
-; GCN: buffer_store_dwordx4
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v4i8_to_v4f32_unaligned:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
+; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
+; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v4
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
+; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
+; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: load_v4i8_to_v4f32_unaligned:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v1, v[6:7]
+; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_e32 v2, v2, v4
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
%load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
@@ -96,31 +263,85 @@ define amdgpu_kernel void @load_v4i8_to_
; FIXME: Need to handle non-uniform case for function below (load without gep).
; Instructions still emitted to repack bytes for add use.
-
-; GCN-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
-; GCN: {{buffer|flat}}_load_dword
-; GCN-DAG: v_cvt_f32_ubyte0_e32
-; GCN-DAG: v_cvt_f32_ubyte1_e32
-; GCN-DAG: v_cvt_f32_ubyte2_e32
-; GCN-DAG: v_cvt_f32_ubyte3_e32
-
-; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24
-
-; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
-; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8
-; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
-; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff00,
-; SI-DAG: v_add_i32
-
-; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffffff00,
-; VI-DAG: v_add_u16_e32
-; VI-DAG: v_add_u16_e32
-
-; GCN-DAG: {{buffer|flat}}_store_dwordx4
-; GCN-DAG: {{buffer|flat}}_store_dword
-
-; GCN: s_endpgm
define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v4i8_to_v4f32_2_uses:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, s3
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_movk_i32 s12, 0x900
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_movk_i32 s13, 0xff
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v5, 24, v1
+; SI-NEXT: v_and_b32_e32 v6, 0xff00, v1
+; SI-NEXT: v_add_i32_e32 v7, vcc, 9, v1
+; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1
+; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
+; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v6
+; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT: v_add_i32_e32 v6, vcc, s12, v6
+; SI-NEXT: v_and_b32_e32 v7, s13, v7
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v1, vcc, s12, v5
+; SI-NEXT: v_and_b32_e32 v2, s13, v4
+; SI-NEXT: v_or_b32_e32 v0, v7, v6
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: load_v4i8_to_v4f32_2_uses:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: s_movk_i32 s8, 0x900
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v5, v[0:1]
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: v_mov_b32_e32 v4, 9
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v5
+; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v5
+; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v5
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v5
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v5
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6
+; VI-NEXT: v_add_u16_e32 v8, 9, v5
+; VI-NEXT: v_add_u16_e32 v0, s8, v7
+; VI-NEXT: v_add_u16_e32 v1, s8, v1
+; VI-NEXT: v_add_u16_sdwa v2, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
@@ -132,9 +353,107 @@ define amdgpu_kernel void @load_v4i8_to_
}
; Make sure this doesn't crash.
-; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
-; GCN: s_endpgm
define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v7i8_to_v7f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
+; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
+; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3
+; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:4
+; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:5
+; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:6
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
+; SI-NEXT: v_or_b32_e32 v2, v2, v4
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:24
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v4, v3, v6
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4
+; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
+; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
+; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: load_v7i8_to_v7f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v10, v[4:5]
+; VI-NEXT: flat_load_ubyte v11, v[2:3]
+; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v8, vcc, 6, v0
+; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v1, v[8:9]
+; VI-NEXT: flat_load_ubyte v7, v[6:7]
+; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10
+; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11
+; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; VI-NEXT: v_or_b32_e32 v0, v3, v0
+; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v1
+; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: v_or_b32_e32 v4, v4, v7
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; VI-NEXT: v_or_b32_e32 v4, v4, v5
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4
+; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
+; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
+; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
%load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
@@ -143,23 +462,56 @@ define amdgpu_kernel void @load_v7i8_to_
ret void
}
-; GCN-LABEL: {{^}}load_v8i8_to_v8f32:
-; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
-; GCN-NOT: bfe
-; GCN-NOT: lshr
-; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
-; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]]
-; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]]
-; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]]
-; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]]
-; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]]
-; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]]
-; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]]
-; GCN-NOT: bfe
-; GCN-NOT: lshr
-; GCN: buffer_store_dwordx4
-; GCN: buffer_store_dwordx4
define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v8i8_to_v8f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_dwordx2 v[7:8], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7
+; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7
+; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
+; SI-NEXT: v_cvt_f32_ubyte3_e32 v7, v8
+; SI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8
+; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: load_v8i8_to_v8f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dwordx2 v[7:8], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7
+; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
+; VI-NEXT: v_cvt_f32_ubyte3_e32 v7, v8
+; VI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
%load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
@@ -168,12 +520,42 @@ define amdgpu_kernel void @load_v8i8_to_
ret void
}
-; GCN-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
-; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]],
-; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
-; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
-; GCN: buffer_store_dword [[CONV]],
define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: i8_zext_inreg_i32_to_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: i8_zext_inreg_i32_to_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%load = load i32, i32 addrspace(1)* %gep, align 4
@@ -184,8 +566,40 @@ define amdgpu_kernel void @i8_zext_inreg
ret void
}
-; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: i8_zext_inreg_hi1_to_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: i8_zext_inreg_hi1_to_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%load = load i32, i32 addrspace(1)* %gep, align 4
@@ -198,8 +612,38 @@ define amdgpu_kernel void @i8_zext_inreg
; We don't get these ones because of the zext, but instcombine removes
; them so it shouldn't really matter.
-; GCN-LABEL: {{^}}i8_zext_i32_to_f32:
define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: i8_zext_i32_to_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: i8_zext_i32_to_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
%load = load i8, i8 addrspace(1)* %gep, align 1
@@ -209,8 +653,71 @@ define amdgpu_kernel void @i8_zext_i32_t
ret void
}
-; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
+; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
+; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v4
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
+; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
+; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v1, v[6:7]
+; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_e32 v4, v2, v0
+; VI-NEXT: v_or_b32_sdwa v0, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v0, v4
+; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
+; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
%load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
@@ -220,12 +727,40 @@ define amdgpu_kernel void @v4i8_zext_v4i
ret void
}
-; GCN-LABEL: {{^}}extract_byte0_to_f32:
-; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
-; GCN-NOT: [[VAL]]
-; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[CONV]]
define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: extract_byte0_to_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: extract_byte0_to_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%val = load i32, i32 addrspace(1)* %gep
@@ -235,12 +770,40 @@ define amdgpu_kernel void @extract_byte0
ret void
}
-; GCN-LABEL: {{^}}extract_byte1_to_f32:
-; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
-; GCN-NOT: [[VAL]]
-; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[CONV]]
define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: extract_byte1_to_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: extract_byte1_to_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%val = load i32, i32 addrspace(1)* %gep
@@ -251,12 +814,40 @@ define amdgpu_kernel void @extract_byte1
ret void
}
-; GCN-LABEL: {{^}}extract_byte2_to_f32:
-; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
-; GCN-NOT: [[VAL]]
-; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[CONV]]
define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: extract_byte2_to_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: extract_byte2_to_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%val = load i32, i32 addrspace(1)* %gep
@@ -267,12 +858,40 @@ define amdgpu_kernel void @extract_byte2
ret void
}
-; GCN-LABEL: {{^}}extract_byte3_to_f32:
-; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
-; GCN-NOT: [[VAL]]
-; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[CONV]]
define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: extract_byte3_to_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: extract_byte3_to_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%val = load i32, i32 addrspace(1)* %gep
@@ -283,13 +902,46 @@ define amdgpu_kernel void @extract_byte3
ret void
}
-; GCN-LABEL: {{^}}cvt_ubyte0_or_multiuse:
-; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]],
-; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], 0x80000001, [[LOADREG]]
-; GCN-DAG: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[OR]]
-; GCN: v_add_f32_e32 [[RES:v[0-9]+]], [[OR]], [[CONV]]
-; GCN: buffer_store_dword [[RES]],
define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
+; SI-LABEL: cvt_ubyte0_or_multiuse:
+; SI: ; %bb.0: ; %bb
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: cvt_ubyte0_or_multiuse:
+; VI: ; %bb.0: ; %bb
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: s_mov_b32 s4, s2
+; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
+; VI-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid
More information about the llvm-commits
mailing list