[llvm] revive the optimization of and/or with immediate value (PR #136169)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 17 10:40:08 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Baoshan (BaoshanPang)
<details>
<summary>Changes</summary>
Setting a virtual register's allocation hint to another virtual register does not work with the current RA implementation; it does work with a physical register:
```diff
modified llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -568,8 +568,10 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
if (NewImm != 0) {
if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
- MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
- MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
+ if(SrcReg->getSubReg() == 0) { // get worse result with subreg. FIXME
+ MRI->setSimpleHint(Dest->getReg(), AMDGPU::FLAT_SCR);
+ MRI->setSimpleHint(SrcReg->getReg(), AMDGPU::FLAT_SCR);
+ }
return true;
}
```
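For readers unfamiliar with the hinting interface, here is a minimal sketch of the difference between the two calls in the hunk above, assuming the standard `MachineRegisterInfo` API. The helper name and its parameters are illustrative only and are not part of the patch.

```cpp
// Sketch only: contrasts virtual-to-virtual hints with a simple hint toward a
// physical register. The helper `hintForShrunkLogicOp` is hypothetical.
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Register.h"

using namespace llvm;

// Ask the register allocator to place both virtual registers in PhysReg.
static void hintForShrunkLogicOp(MachineRegisterInfo &MRI, Register Dst,
                                 Register Src, MCRegister PhysReg) {
  // Reverted approach: hint each virtual register toward the other virtual
  // register (hint type 0, target-independent). Per the description above,
  // the current allocator does not honor virtual-register hints here.
  // MRI.setRegAllocationHint(Dst, /*Type=*/0, Src);
  // MRI.setRegAllocationHint(Src, /*Type=*/0, Dst);

  // Approach taken in this patch: hint both virtual registers toward a
  // physical register. setSimpleHint(Reg, PrefReg) is shorthand for
  // setRegAllocationHint(Reg, /*Type=*/0, PrefReg).
  MRI.setSimpleHint(Dst, PhysReg);
  MRI.setSimpleHint(Src, PhysReg);
}
```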
---
Patch is 99.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/136169.diff
13 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/andorbitset.ll (+28-28)
- (modified) llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll (+42-42)
- (modified) llvm/test/CodeGen/AMDGPU/bf16-conversions.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll (+19-19)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.ll (+39-39)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+191-192)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll (+90-90)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (+54-54)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.ll (+39-39)
- (modified) llvm/test/CodeGen/AMDGPU/fptrunc.ll (+284-286)
- (modified) llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/permute.ll (+8-8)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 73343e1c80f33..7e1d8a01dbd5f 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -568,8 +568,10 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
if (NewImm != 0) {
if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
- MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
- MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
+ if(SrcReg->getSubReg() == 0) { // get worse result with subreg. FIXME
+ MRI->setSimpleHint(Dest->getReg(), AMDGPU::FLAT_SCR);
+ MRI->setSimpleHint(SrcReg->getReg(), AMDGPU::FLAT_SCR);
+ }
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/andorbitset.ll b/llvm/test/CodeGen/AMDGPU/andorbitset.ll
index a60d14cd46573..0e1c324987fb2 100644
--- a/llvm/test/CodeGen/AMDGPU/andorbitset.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorbitset.ll
@@ -4,14 +4,14 @@
define amdgpu_kernel void @s_clear_msb(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_clear_msb:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s4, s6, 0x7fffffff
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_bitset0_b32 s0, 31
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = and i32 %in, 2147483647
store i32 %x, ptr addrspace(1) %out
@@ -21,14 +21,14 @@ define amdgpu_kernel void @s_clear_msb(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_set_msb(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_set_msb:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_or_b32 s4, s6, 0x80000000
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_bitset1_b32 s0, 31
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = or i32 %in, 2147483648
store i32 %x, ptr addrspace(1) %out
@@ -72,14 +72,14 @@ define amdgpu_kernel void @s_set_lsb(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_clear_midbit(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_clear_midbit:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s4, s6, 0xfffffeff
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_bitset0_b32 s0, 8
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = and i32 %in, 4294967039
store i32 %x, ptr addrspace(1) %out
@@ -89,14 +89,14 @@ define amdgpu_kernel void @s_clear_midbit(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_set_midbit(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_set_midbit:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_or_b32 s4, s6, 0x100
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_bitset1_b32 s0, 8
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = or i32 %in, 256
store i32 %x, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
index e68a2cdc0b846..6fd2eb7b61042 100644
--- a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
@@ -4,14 +4,14 @@
define amdgpu_kernel void @s_or_to_orn2(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_or_to_orn2:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_or_b32 s4, s6, 0xffffffcd
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_orn2_b32 s0, s0, 50
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = or i32 %in, -51
store i32 %x, ptr addrspace(1) %out
@@ -21,14 +21,14 @@ define amdgpu_kernel void @s_or_to_orn2(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_or_to_orn2_imm0(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_or_to_orn2_imm0:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_or_b32 s4, s6, 0xffffffcd
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_orn2_b32 s0, s0, 50
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = or i32 -51, %in
store i32 %x, ptr addrspace(1) %out
@@ -38,14 +38,14 @@ define amdgpu_kernel void @s_or_to_orn2_imm0(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_and_to_andn2(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_and_to_andn2:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s4, s6, 0xffffffcd
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_andn2_b32 s0, s0, 50
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = and i32 %in, -51
store i32 %x, ptr addrspace(1) %out
@@ -55,14 +55,14 @@ define amdgpu_kernel void @s_and_to_andn2(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_and_to_andn2_imm0(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_and_to_andn2_imm0:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s4, s6, 0xffffffcd
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_andn2_b32 s0, s0, 50
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = and i32 -51, %in
store i32 %x, ptr addrspace(1) %out
@@ -72,14 +72,14 @@ define amdgpu_kernel void @s_and_to_andn2_imm0(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_xor_to_xnor(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_xor_to_xnor:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_xor_b32 s4, s6, 0xffffffcd
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_xnor_b32 s0, s0, 50
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = xor i32 %in, -51
store i32 %x, ptr addrspace(1) %out
@@ -89,14 +89,14 @@ define amdgpu_kernel void @s_xor_to_xnor(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_xor_to_xnor_imm0(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_xor_to_xnor_imm0:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_xor_b32 s4, s6, 0xffffffcd
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_xnor_b32 s0, s0, 50
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%x = xor i32 -51, %in
store i32 %x, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index a597faa028f22..4694c3ebac3cd 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -55,23 +55,23 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
; GFX-942-LABEL: v_test_cvt_v2f32_v2bf16_s:
; GFX-942: ; %bb.0:
-; GFX-942-NEXT: s_bfe_u32 s2, s1, 0x10010
-; GFX-942-NEXT: s_add_i32 s2, s2, s1
-; GFX-942-NEXT: s_or_b32 s4, s1, 0x400000
+; GFX-942-NEXT: s_bfe_u32 s3, s1, 0x10010
+; GFX-942-NEXT: s_add_i32 s3, s3, s1
+; GFX-942-NEXT: s_or_b32 s2, s1, 0x400000
+; GFX-942-NEXT: s_addk_i32 s3, 0x7fff
+; GFX-942-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1
+; GFX-942-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX-942-NEXT: s_cselect_b32 s1, s2, s3
+; GFX-942-NEXT: s_bfe_u32 s2, s0, 0x10010
+; GFX-942-NEXT: s_add_i32 s2, s2, s0
+; GFX-942-NEXT: s_lshr_b32 s4, s1, 16
+; GFX-942-NEXT: s_or_b32 s1, s0, 0x400000
; GFX-942-NEXT: s_add_i32 s5, s2, 0x7fff
-; GFX-942-NEXT: v_cmp_u_f32_e64 s[2:3], s1, s1
+; GFX-942-NEXT: v_cmp_u_f32_e64 s[2:3], s0, s0
; GFX-942-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX-942-NEXT: s_cselect_b32 s1, s4, s5
-; GFX-942-NEXT: s_lshr_b32 s2, s1, 16
-; GFX-942-NEXT: s_bfe_u32 s1, s0, 0x10010
-; GFX-942-NEXT: s_add_i32 s1, s1, s0
-; GFX-942-NEXT: s_or_b32 s3, s0, 0x400000
-; GFX-942-NEXT: s_add_i32 s4, s1, 0x7fff
-; GFX-942-NEXT: v_cmp_u_f32_e64 s[0:1], s0, s0
-; GFX-942-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; GFX-942-NEXT: s_cselect_b32 s0, s3, s4
+; GFX-942-NEXT: s_cselect_b32 s0, s1, s5
; GFX-942-NEXT: s_lshr_b32 s0, s0, 16
-; GFX-942-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX-942-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX-942-NEXT: v_mov_b32_e32 v0, s0
; GFX-942-NEXT: ; return to shader part epilog
;
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 9fcfbba6fb235..6627aea4c0946 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -1547,30 +1547,30 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
;
; VI-LABEL: v_cttz_i16_sel_eq_neg1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s4, s2, 1
-; VI-NEXT: s_addc_u32 s5, s3, 0
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_add_u32 s0, s6, 1
+; VI-NEXT: s_addc_u32 s1, s7, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_readfirstlane_b32 s2, v2
+; VI-NEXT: v_readfirstlane_b32 s0, v2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s3, v0
-; VI-NEXT: s_lshl_b32 s2, s2, 8
-; VI-NEXT: s_or_b32 s2, s2, s3
-; VI-NEXT: s_or_b32 s3, s2, 0x10000
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_ff1_i32_b32 s3, s3
-; VI-NEXT: s_cmp_lg_u32 s2, 0
-; VI-NEXT: s_cselect_b32 s2, s3, 0xffff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_readfirstlane_b32 s1, v0
+; VI-NEXT: s_lshl_b32 s0, s0, 8
+; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: s_or_b32 s1, s0, 0x10000
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_ff1_i32_b32 s1, s1
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_cselect_b32 s0, s1, 0xffff
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index 6bcb086944c91..ee6619e2eb659 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -9,25 +9,25 @@
define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_fabsf_fn_free:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_load_dword s4, s[4:5], 0xb
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bitset0_b32 s4, 31
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_bitset0_b32 s0, 31
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_fabsf_fn_free:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT: s_load_dword s0, s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_bitset0_b32 s0, 31
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%bc= bitcast i32 %in to float
@@ -39,25 +39,25 @@ define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_fabsf_free:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_load_dword s4, s[4:5], 0xb
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bitset0_b32 s4, 31
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_bitset0_b32 s0, 31
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_fabsf_free:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT: s_load_dword s0, s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_bitset0_b32 s0, 31
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%bc= bitcast i32 %in to float
@@ -69,25 +69,25 @@ define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: s_fabsf_f32:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_load_dword s4, s[4:5], 0xb
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bitset0_b32 s4, 31
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_bitset0_b32 s0, 31
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_fabsf_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT: s_load_dword s0, s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_bitset0_b32 s0, 31
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index b4b9c2d3e0135..26af32f779783 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -276,62 +276,62 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half
define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half %mag) {
; SI-LABEL: s_test_copysign_f16_neg1:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dword s0, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_or_b32 s4, s6, 0x8000
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_bitset1_b32 s0, 15
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f16_neg1:
; VI: ; %bb.0:
-...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/136169
More information about the llvm-commits mailing list