[llvm] revive the optimization of and/or with immediate value (PR #136169)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 17 10:40:08 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Baoshan (BaoshanPang)
<details>
<summary>Changes</summary>
Setting a virtual register's allocation hint to another virtual register does not work with the current RA implementation; it does work with a physical register:
```diff
modified llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -568,8 +568,10 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
if (NewImm != 0) {
if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
- MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
- MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
+ if(SrcReg->getSubReg() == 0) { // get worse result with subreg. FIXME
+ MRI->setSimpleHint(Dest->getReg(), AMDGPU::FLAT_SCR);
+ MRI->setSimpleHint(SrcReg->getReg(), AMDGPU::FLAT_SCR);
+ }
return true;
}
```
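For readers unfamiliar with the hinting interface, here is a minimal sketch of the difference between the two calls in the hunk above, assuming the standard `MachineRegisterInfo` API. The helper name and its parameters are illustrative only and are not part of the patch.

```cpp
// Sketch only: contrasts virtual-to-virtual hints with a simple hint toward a
// physical register. The helper `hintForShrunkLogicOp` is hypothetical.
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Register.h"

using namespace llvm;

// Ask the register allocator to place both virtual registers in PhysReg.
static void hintForShrunkLogicOp(MachineRegisterInfo &MRI, Register Dst,
                                 Register Src, MCRegister PhysReg) {
  // Reverted approach: hint each virtual register toward the other virtual
  // register (hint type 0, target-independent). Per the description above,
  // the current allocator does not honor virtual-register hints here.
  // MRI.setRegAllocationHint(Dst, /*Type=*/0, Src);
  // MRI.setRegAllocationHint(Src, /*Type=*/0, Dst);

  // Approach taken in this patch: hint both virtual registers toward a
  // physical register. setSimpleHint(Reg, PrefReg) is shorthand for
  // setRegAllocationHint(Reg, /*Type=*/0, PrefReg).
  MRI.setSimpleHint(Dst, PhysReg);
  MRI.setSimpleHint(Src, PhysReg);
}
```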
---
Patch is 99.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/136169.diff
13 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/andorbitset.ll (+28-28)
- (modified) llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll (+42-42)
- (modified) llvm/test/CodeGen/AMDGPU/bf16-conversions.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll (+19-19)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.ll (+39-39)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+191-192)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll (+90-90)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (+54-54)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.ll (+39-39)
- (modified) llvm/test/CodeGen/AMDGPU/fptrunc.ll (+284-286)
- (modified) llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/permute.ll (+8-8)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 73343e1c80f33..7e1d8a01dbd5f 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -568,8 +568,10 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
if (NewImm != 0) {
if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
- MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
- MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
+ if(SrcReg->getSubReg() == 0) { // get worse result with subreg. FIXME
+ MRI->setSimpleHint(Dest->getReg(), AMDGPU::FLAT_SCR);
+ MRI->setSimpleHint(SrcReg->getReg(), AMDGPU::FLAT_SCR);
+ }
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/andorbitset.ll b/llvm/test/CodeGen/AMDGPU/andorbitset.ll
index a60d14cd46573..0e1c324987fb2 100644
--- a/llvm/test/CodeGen/AMDGPU/andorbitset.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorbitset.ll
@@ -4,14 +4,14 @@
define amdgpu_kernel void @s_clear_msb(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_clear_msb:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s4, s6, 0x7fffffff
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_bitset0_b32 s0, 31
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = and i32 %in, 2147483647
store i32 %x, ptr addrspace(1) %out
@@ -21,14 +21,14 @@ define amdgpu_kernel void @s_clear_msb(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_set_msb(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_set_msb:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_or_b32 s4, s6, 0x80000000
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_bitset1_b32 s0, 31
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = or i32 %in, 2147483648
store i32 %x, ptr addrspace(1) %out
@@ -72,14 +72,14 @@ define amdgpu_kernel void @s_set_lsb(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_clear_midbit(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_clear_midbit:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s4, s6, 0xfffffeff
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_bitset0_b32 s0, 8
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = and i32 %in, 4294967039
store i32 %x, ptr addrspace(1) %out
@@ -89,14 +89,14 @@ define amdgpu_kernel void @s_clear_midbit(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_set_midbit(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_set_midbit:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_or_b32 s4, s6, 0x100
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_bitset1_b32 s0, 8
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = or i32 %in, 256
store i32 %x, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
index e68a2cdc0b846..6fd2eb7b61042 100644
--- a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
@@ -4,14 +4,14 @@
define amdgpu_kernel void @s_or_to_orn2(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_or_to_orn2:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_or_b32 s4, s6, 0xffffffcd
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_orn2_b32 s0, s0, 50
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = or i32 %in, -51
store i32 %x, ptr addrspace(1) %out
@@ -21,14 +21,14 @@ define amdgpu_kernel void @s_or_to_orn2(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_or_to_orn2_imm0(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_or_to_orn2_imm0:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_or_b32 s4, s6, 0xffffffcd
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_orn2_b32 s0, s0, 50
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = or i32 -51, %in
store i32 %x, ptr addrspace(1) %out
@@ -38,14 +38,14 @@ define amdgpu_kernel void @s_or_to_orn2_imm0(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_and_to_andn2(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_and_to_andn2:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s4, s6, 0xffffffcd
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_andn2_b32 s0, s0, 50
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = and i32 %in, -51
store i32 %x, ptr addrspace(1) %out
@@ -55,14 +55,14 @@ define amdgpu_kernel void @s_and_to_andn2(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_and_to_andn2_imm0(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_and_to_andn2_imm0:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s4, s6, 0xffffffcd
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_andn2_b32 s0, s0, 50
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = and i32 -51, %in
store i32 %x, ptr addrspace(1) %out
@@ -72,14 +72,14 @@ define amdgpu_kernel void @s_and_to_andn2_imm0(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_xor_to_xnor(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_xor_to_xnor:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_xor_b32 s4, s6, 0xffffffcd
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_xnor_b32 s0, s0, 50
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = xor i32 %in, -51
store i32 %x, ptr addrspace(1) %out
@@ -89,14 +89,14 @@ define amdgpu_kernel void @s_xor_to_xnor(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_xor_to_xnor_imm0(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_xor_to_xnor_imm0:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_xor_b32 s4, s6, 0xffffffcd
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_xnor_b32 s0, s0, 50
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = xor i32 -51, %in
store i32 %x, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index a597faa028f22..4694c3ebac3cd 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -55,23 +55,23 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
; GFX-942-LABEL: v_test_cvt_v2f32_v2bf16_s:
; GFX-942: ; %bb.0:
-; GFX-942-NEXT: s_bfe_u32 s2, s1, 0x10010
-; GFX-942-NEXT: s_add_i32 s2, s2, s1
-; GFX-942-NEXT: s_or_b32 s4, s1, 0x400000
+; GFX-942-NEXT: s_bfe_u32 s3, s1, 0x10010
+; GFX-942-NEXT: s_add_i32 s3, s3, s1
+; GFX-942-NEXT: s_or_b32 s2, s1, 0x400000
+; GFX-942-NEXT: s_addk_i32 s3, 0x7fff
+; GFX-942-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1
+; GFX-942-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX-942-NEXT: s_cselect_b32 s1, s2, s3
+; GFX-942-NEXT: s_bfe_u32 s2, s0, 0x10010
+; GFX-942-NEXT: s_add_i32 s2, s2, s0
+; GFX-942-NEXT: s_lshr_b32 s4, s1, 16
+; GFX-942-NEXT: s_or_b32 s1, s0, 0x400000
; GFX-942-NEXT: s_add_i32 s5, s2, 0x7fff
-; GFX-942-NEXT: v_cmp_u_f32_e64 s[2:3], s1, s1
+; GFX-942-NEXT: v_cmp_u_f32_e64 s[2:3], s0, s0
; GFX-942-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX-942-NEXT: s_cselect_b32 s1, s4, s5
-; GFX-942-NEXT: s_lshr_b32 s2, s1, 16
-; GFX-942-NEXT: s_bfe_u32 s1, s0, 0x10010
-; GFX-942-NEXT: s_add_i32 s1, s1, s0
-; GFX-942-NEXT: s_or_b32 s3, s0, 0x400000
-; GFX-942-NEXT: s_add_i32 s4, s1, 0x7fff
-; GFX-942-NEXT: v_cmp_u_f32_e64 s[0:1], s0, s0
-; GFX-942-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; GFX-942-NEXT: s_cselect_b32 s0, s3, s4
+; GFX-942-NEXT: s_cselect_b32 s0, s1, s5
; GFX-942-NEXT: s_lshr_b32 s0, s0, 16
-; GFX-942-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX-942-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX-942-NEXT: v_mov_b32_e32 v0, s0
; GFX-942-NEXT: ; return to shader part epilog
;
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 9fcfbba6fb235..6627aea4c0946 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -1547,30 +1547,30 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
;
; VI-LABEL: v_cttz_i16_sel_eq_neg1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s4, s2, 1
-; VI-NEXT: s_addc_u32 s5, s3, 0
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_add_u32 s0, s6, 1
+; VI-NEXT: s_addc_u32 s1, s7, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_readfirstlane_b32 s2, v2
+; VI-NEXT: v_readfirstlane_b32 s0, v2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s3, v0
-; VI-NEXT: s_lshl_b32 s2, s2, 8
-; VI-NEXT: s_or_b32 s2, s2, s3
-; VI-NEXT: s_or_b32 s3, s2, 0x10000
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_ff1_i32_b32 s3, s3
-; VI-NEXT: s_cmp_lg_u32 s2, 0
-; VI-NEXT: s_cselect_b32 s2, s3, 0xffff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_readfirstlane_b32 s1, v0
+; VI-NEXT: s_lshl_b32 s0, s0, 8
+; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: s_or_b32 s1, s0, 0x10000
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_ff1_i32_b32 s1, s1
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_cselect_b32 s0, s1, 0xffff
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index 6bcb086944c91..ee6619e2eb659 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -9,25 +9,25 @@
define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_fabsf_fn_free:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_load_dword s4, s[4:5], 0xb
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bitset0_b32 s4, 31
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_bitset0_b32 s0, 31
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_fabsf_fn_free:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT: s_load_dword s0, s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_bitset0_b32 s0, 31
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%bc= bitcast i32 %in to float
@@ -39,25 +39,25 @@ define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_fabsf_free:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_load_dword s4, s[4:5], 0xb
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bitset0_b32 s4, 31
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_bitset0_b32 s0, 31
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_fabsf_free:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT: s_load_dword s0, s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_bitset0_b32 s0, 31
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%bc= bitcast i32 %in to float
@@ -69,25 +69,25 @@ define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: s_fabsf_f32:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_load_dword s4, s[4:5], 0xb
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bitset0_b32 s4, 31
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_bitset0_b32 s0, 31
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_fabsf_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT: s_load_dword s0, s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_bitset0_b32 s0, 31
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index b4b9c2d3e0135..26af32f779783 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -276,62 +276,62 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half
define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half %mag) {
; SI-LABEL: s_test_copysign_f16_neg1:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_or_b32 s4, s6, 0x8000
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_bitset1_b32 s0, 15
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f16_neg1:
; VI: ; %bb.0:
-...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/136169
More information about the llvm-commits mailing list