[llvm] [NFC][AMDGPU] Regenerate CHECK lines in commute-compares.ll. (PR #140076)
Harrison Hao via llvm-commits
llvm-commits at lists.llvm.org
Thu May 15 08:07:26 PDT 2025
https://github.com/harrisonGPU created https://github.com/llvm/llvm-project/pull/140076
(No description provided.)
From 35a78a3925e56c245131bc7f37a959f1ca4d25a7 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 15 May 2025 15:05:48 +0000
Subject: [PATCH] [NFC][AMDGPU] Regenerate CHECK lines in commute-compares.ll.
---
llvm/test/CodeGen/AMDGPU/commute-compares.ll | 979 ++++++++++++++++---
1 file changed, 865 insertions(+), 114 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
index fcb871cedd0cb..ae8080cf9f06a 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
@@ -6,9 +7,23 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
; i32 compares
; --------------------------------------------------------------------------------
-; GCN-LABEL: {{^}}commute_eq_64_i32:
-; GCN: v_cmp_eq_u32_e32 vcc, 64, v{{[0-9]+}}
define amdgpu_kernel void @commute_eq_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_eq_64_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 64, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -19,9 +34,23 @@ define amdgpu_kernel void @commute_eq_64_i32(ptr addrspace(1) %out, ptr addrspac
ret void
}
-; GCN-LABEL: {{^}}commute_ne_64_i32:
-; GCN: v_cmp_ne_u32_e32 vcc, 64, v{{[0-9]+}}
define amdgpu_kernel void @commute_ne_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ne_64_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 64, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -33,10 +62,24 @@ define amdgpu_kernel void @commute_ne_64_i32(ptr addrspace(1) %out, ptr addrspac
}
; FIXME: Why isn't this being folded as a constant?
-; GCN-LABEL: {{^}}commute_ne_litk_i32:
-; GCN: s_movk_i32 [[K:s[0-9]+]], 0x3039
-; GCN: v_cmp_ne_u32_e32 vcc, [[K]], v{{[0-9]+}}
define amdgpu_kernel void @commute_ne_litk_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ne_litk_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_movk_i32 s4, 0x3039
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, s4, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -47,9 +90,23 @@ define amdgpu_kernel void @commute_ne_litk_i32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_ugt_64_i32:
-; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}}
define amdgpu_kernel void @commute_ugt_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ugt_64_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 64, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -60,9 +117,23 @@ define amdgpu_kernel void @commute_ugt_64_i32(ptr addrspace(1) %out, ptr addrspa
ret void
}
-; GCN-LABEL: {{^}}commute_uge_64_i32:
-; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}}
define amdgpu_kernel void @commute_uge_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uge_64_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 63, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -73,9 +144,23 @@ define amdgpu_kernel void @commute_uge_64_i32(ptr addrspace(1) %out, ptr addrspa
ret void
}
-; GCN-LABEL: {{^}}commute_ult_64_i32:
-; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
define amdgpu_kernel void @commute_ult_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ult_64_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -86,9 +171,23 @@ define amdgpu_kernel void @commute_ult_64_i32(ptr addrspace(1) %out, ptr addrspa
ret void
}
-; GCN-LABEL: {{^}}commute_ule_63_i32:
-; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
define amdgpu_kernel void @commute_ule_63_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_63_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -99,10 +198,24 @@ define amdgpu_kernel void @commute_ule_63_i32(ptr addrspace(1) %out, ptr addrspa
ret void
}
-; GCN-LABEL: {{^}}commute_ule_64_i32:
-; GCN: s_movk_i32 [[K:s[0-9]+]], 0x41{{$}}
-; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
define amdgpu_kernel void @commute_ule_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_64_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_movk_i32 s4, 0x41
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_u32_e32 vcc, s4, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -113,9 +226,23 @@ define amdgpu_kernel void @commute_ule_64_i32(ptr addrspace(1) %out, ptr addrspa
ret void
}
-; GCN-LABEL: {{^}}commute_sgt_neg1_i32:
-; GCN: v_ashrrev_i32_e32 v2, 31, v2
define amdgpu_kernel void @commute_sgt_neg1_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sgt_neg1_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_not_b32_e32 v2, v2
+; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v2
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -126,9 +253,23 @@ define amdgpu_kernel void @commute_sgt_neg1_i32(ptr addrspace(1) %out, ptr addrs
ret void
}
-; GCN-LABEL: {{^}}commute_sge_neg2_i32:
-; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}}
define amdgpu_kernel void @commute_sge_neg2_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sge_neg2_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -3, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -139,9 +280,23 @@ define amdgpu_kernel void @commute_sge_neg2_i32(ptr addrspace(1) %out, ptr addrs
ret void
}
-; GCN-LABEL: {{^}}commute_slt_neg16_i32:
-; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}}
define amdgpu_kernel void @commute_slt_neg16_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_slt_neg16_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, -16, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -152,9 +307,23 @@ define amdgpu_kernel void @commute_slt_neg16_i32(ptr addrspace(1) %out, ptr addr
ret void
}
-; GCN-LABEL: {{^}}commute_sle_5_i32:
-; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}}
define amdgpu_kernel void @commute_sle_5_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sle_5_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 6, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -169,9 +338,24 @@ define amdgpu_kernel void @commute_sle_5_i32(ptr addrspace(1) %out, ptr addrspac
; i64 compares
; --------------------------------------------------------------------------------
-; GCN-LABEL: {{^}}commute_eq_64_i64:
-; GCN: v_cmp_eq_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_eq_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_eq_64_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 64, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -182,9 +366,24 @@ define amdgpu_kernel void @commute_eq_64_i64(ptr addrspace(1) %out, ptr addrspac
ret void
}
-; GCN-LABEL: {{^}}commute_ne_64_i64:
-; GCN: v_cmp_ne_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ne_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ne_64_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 64, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -195,9 +394,24 @@ define amdgpu_kernel void @commute_ne_64_i64(ptr addrspace(1) %out, ptr addrspac
ret void
}
-; GCN-LABEL: {{^}}commute_ugt_64_i64:
-; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ugt_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ugt_64_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_u64_e32 vcc, 64, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -208,9 +422,24 @@ define amdgpu_kernel void @commute_ugt_64_i64(ptr addrspace(1) %out, ptr addrspa
ret void
}
-; GCN-LABEL: {{^}}commute_uge_64_i64:
-; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_uge_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uge_64_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -221,9 +450,24 @@ define amdgpu_kernel void @commute_uge_64_i64(ptr addrspace(1) %out, ptr addrspa
ret void
}
-; GCN-LABEL: {{^}}commute_ult_64_i64:
-; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ult_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ult_64_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -234,9 +478,24 @@ define amdgpu_kernel void @commute_ult_64_i64(ptr addrspace(1) %out, ptr addrspa
ret void
}
-; GCN-LABEL: {{^}}commute_ule_63_i64:
-; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ule_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_63_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -249,10 +508,25 @@ define amdgpu_kernel void @commute_ule_63_i64(ptr addrspace(1) %out, ptr addrspa
; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm
-; GCN-LABEL: {{^}}commute_ule_64_i64:
-; GCN: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x41
-; GCN: v_cmp_gt_u64_e32 vcc, [[K]], v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ule_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_64_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[4:5], 0x41
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -263,9 +537,24 @@ define amdgpu_kernel void @commute_ule_64_i64(ptr addrspace(1) %out, ptr addrspa
ret void
}
-; GCN-LABEL: {{^}}commute_sgt_neg1_i64:
-; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_sgt_neg1_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sgt_neg1_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -276,9 +565,24 @@ define amdgpu_kernel void @commute_sgt_neg1_i64(ptr addrspace(1) %out, ptr addrs
ret void
}
-; GCN-LABEL: {{^}}commute_sge_neg2_i64:
-; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_sge_neg2_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sge_neg2_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -3, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -289,9 +593,24 @@ define amdgpu_kernel void @commute_sge_neg2_i64(ptr addrspace(1) %out, ptr addrs
ret void
}
-; GCN-LABEL: {{^}}commute_slt_neg16_i64:
-; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_slt_neg16_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_slt_neg16_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_i64_e32 vcc, -16, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -302,9 +621,24 @@ define amdgpu_kernel void @commute_slt_neg16_i64(ptr addrspace(1) %out, ptr addr
ret void
}
-; GCN-LABEL: {{^}}commute_sle_5_i64:
-; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_sle_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_sle_5_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 6, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -319,10 +653,23 @@ define amdgpu_kernel void @commute_sle_5_i64(ptr addrspace(1) %out, ptr addrspac
; f32 compares
; --------------------------------------------------------------------------------
-
-; GCN-LABEL: {{^}}commute_oeq_2.0_f32:
-; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_oeq_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_oeq_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -333,10 +680,23 @@ define amdgpu_kernel void @commute_oeq_2.0_f32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-
-; GCN-LABEL: {{^}}commute_ogt_2.0_f32:
-; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_ogt_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ogt_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -347,9 +707,23 @@ define amdgpu_kernel void @commute_ogt_2.0_f32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_oge_2.0_f32:
-; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_oge_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_oge_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_le_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -360,9 +734,23 @@ define amdgpu_kernel void @commute_oge_2.0_f32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_olt_2.0_f32:
-; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_olt_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_olt_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -373,9 +761,23 @@ define amdgpu_kernel void @commute_olt_2.0_f32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_ole_2.0_f32:
-; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_ole_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ole_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ge_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -386,9 +788,23 @@ define amdgpu_kernel void @commute_ole_2.0_f32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_one_2.0_f32:
-; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_one_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_one_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lg_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -399,9 +815,23 @@ define amdgpu_kernel void @commute_one_2.0_f32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_ord_2.0_f32:
-; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
define amdgpu_kernel void @commute_ord_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ord_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -412,9 +842,23 @@ define amdgpu_kernel void @commute_ord_2.0_f32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_ueq_2.0_f32:
-; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_ueq_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ueq_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nlg_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -425,9 +869,23 @@ define amdgpu_kernel void @commute_ueq_2.0_f32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_ugt_2.0_f32:
-; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_ugt_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ugt_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nge_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -438,9 +896,23 @@ define amdgpu_kernel void @commute_ugt_2.0_f32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_uge_2.0_f32:
-; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_uge_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uge_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -451,9 +923,23 @@ define amdgpu_kernel void @commute_uge_2.0_f32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_ult_2.0_f32:
-; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_ult_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ult_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nle_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -464,9 +950,23 @@ define amdgpu_kernel void @commute_ult_2.0_f32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_ule_2.0_f32:
-; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_ule_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -477,9 +977,23 @@ define amdgpu_kernel void @commute_ule_2.0_f32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_une_2.0_f32:
-; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}}
define amdgpu_kernel void @commute_une_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_une_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_neq_f32_e32 vcc, 2.0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -490,9 +1004,23 @@ define amdgpu_kernel void @commute_une_2.0_f32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_uno_2.0_f32:
-; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
define amdgpu_kernel void @commute_uno_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uno_2.0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -507,10 +1035,24 @@ define amdgpu_kernel void @commute_uno_2.0_f32(ptr addrspace(1) %out, ptr addrsp
; f64 compares
; --------------------------------------------------------------------------------
-
-; GCN-LABEL: {{^}}commute_oeq_2.0_f64:
-; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_oeq_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_oeq_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_eq_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -521,10 +1063,24 @@ define amdgpu_kernel void @commute_oeq_2.0_f64(ptr addrspace(1) %out, ptr addrsp
ret void
}
-
-; GCN-LABEL: {{^}}commute_ogt_2.0_f64:
-; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ogt_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ogt_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lt_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -535,9 +1091,24 @@ define amdgpu_kernel void @commute_ogt_2.0_f64(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_oge_2.0_f64:
-; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_oge_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_oge_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_le_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -548,9 +1119,24 @@ define amdgpu_kernel void @commute_oge_2.0_f64(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_olt_2.0_f64:
-; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_olt_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_olt_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -561,9 +1147,24 @@ define amdgpu_kernel void @commute_olt_2.0_f64(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_ole_2.0_f64:
-; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ole_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ole_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ge_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -574,9 +1175,24 @@ define amdgpu_kernel void @commute_ole_2.0_f64(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_one_2.0_f64:
-; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_one_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_one_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_lg_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -587,9 +1203,24 @@ define amdgpu_kernel void @commute_one_2.0_f64(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_ord_2.0_f64:
-; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
define amdgpu_kernel void @commute_ord_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ord_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_o_f64_e32 vcc, v[3:4], v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -600,9 +1231,24 @@ define amdgpu_kernel void @commute_ord_2.0_f64(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_ueq_2.0_f64:
-; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ueq_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ueq_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nlg_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -613,9 +1259,24 @@ define amdgpu_kernel void @commute_ueq_2.0_f64(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_ugt_2.0_f64:
-; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ugt_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ugt_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nge_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -626,9 +1287,24 @@ define amdgpu_kernel void @commute_ugt_2.0_f64(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_uge_2.0_f64:
-; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_uge_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uge_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ngt_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -639,9 +1315,24 @@ define amdgpu_kernel void @commute_uge_2.0_f64(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_ult_2.0_f64:
-; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ult_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ult_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nle_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -652,9 +1343,24 @@ define amdgpu_kernel void @commute_ult_2.0_f64(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_ule_2.0_f64:
-; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_ule_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_ule_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -665,9 +1371,24 @@ define amdgpu_kernel void @commute_ule_2.0_f64(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_une_2.0_f64:
-; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @commute_une_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_une_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_neq_f64_e32 vcc, 2.0, v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -678,9 +1399,24 @@ define amdgpu_kernel void @commute_une_2.0_f64(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}commute_uno_2.0_f64:
-; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
define amdgpu_kernel void @commute_uno_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; GCN-LABEL: commute_uno_2.0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[3:4], v[3:4]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -696,12 +1432,27 @@ define amdgpu_kernel void @commute_uno_2.0_f64(ptr addrspace(1) %out, ptr addrsp
; Without commuting the frame index in the pre-regalloc run of
; SIShrinkInstructions, this was using the VOP3 compare.
-; GCN-LABEL: {{^}}commute_frameindex:
-; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
-
-; GCN: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
-; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}}
define amdgpu_kernel void @commute_frameindex(ptr addrspace(1) nocapture %out) #0 {
+; GCN-LABEL: commute_frameindex:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s14, -1
+; GCN-NEXT: s_mov_b32 s15, 0xe8f000
+; GCN-NEXT: s_add_u32 s12, s12, s11
+; GCN-NEXT: s_addc_u32 s13, s13, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
entry:
%stack0 = alloca i32, addrspace(5)
%ptr0 = load volatile ptr addrspace(5), ptr addrspace(1) poison
More information about the llvm-commits
mailing list