[llvm] [AMDGPU] Allow shrinking instruction with dead sdst (PR #68028)

via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 2 13:16:23 PDT 2023


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-globalisel

<details>
<summary>Changes</summary>

The pre-RA instruction shrinking pass does not shrink instructions that have an sdst carry-out and a carry-in. Instead, it sets the allocation hint to VCC and leaves the instruction alone until after RA.

We can still shrink such an instruction before RA if the sdst is dead and the carry-in is an immediate.

However, some other instructions will no longer be shrunk after RA: more instructions now clobber VCC, and if a carry is used after such a clobber, RA has no choice but to allocate a non-VCC SGPR for it. The net effect seems to be positive, though. In the affected tests we had 1231 _e64 and 1162 _e32 instructions; now there are 1010 _e64 and 1359 _e32 instructions according to the diff (the numbers do not add up exactly because some checks are collapsed for different targets). This is a ~8% improvement in shrinking. Also note that the regression cases are on the old targets that lack no-carry add/sub instructions, and many of the tests target GFX6.

---

Patch is 390.11 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68028.diff


19 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp (+23-15) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll (+5-5) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+246-268) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll (+80-80) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll (+272-272) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll (+168-168) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll (+8-8) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll (+220-220) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+246-268) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll (+76-76) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll (+229-229) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll (+179-179) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll (+219-222) 
- (modified) llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll (+99-99) 
- (modified) llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll (+362-362) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/med3-knownbits.ll (+4-4) 
- (added) llvm/test/CodeGen/AMDGPU/shrink-dead-sdst.mir (+12) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 4159dc694c1e037..bba533a4ca0599a 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -961,26 +961,34 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
                                                         AMDGPU::OpName::sdst);
 
       if (SDst) {
-        bool Next = false;
-
-        if (SDst->getReg() != VCCReg) {
-          if (SDst->getReg().isVirtual())
-            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
-          Next = true;
-        }
-
         // All of the instructions with carry outs also have an SGPR input in
         // src2.
         const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                           AMDGPU::OpName::src2);
-        if (Src2 && Src2->getReg() != VCCReg) {
-          if (Src2->getReg().isVirtual())
-            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
-          Next = true;
-        }
 
-        if (Next)
-          continue;
+        // We can shrink the instruction right now if sdst is dead anyway and
+        // carry-in is not a register. If it is a register then VOP2 form shall
+        // have it set to the same vcc register and we may end up reading an
+        // undefined vcc.
+        if (!SDst->isDead() || SDst->getReg().isPhysical() ||
+            (Src2 && Src2->isReg())) {
+          bool Next = false;
+
+          if (SDst->getReg() != VCCReg) {
+            if (SDst->getReg().isVirtual())
+              MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
+            Next = true;
+          }
+
+          if (Src2 && Src2->getReg() != VCCReg) {
+            if (Src2->getReg().isVirtual())
+              MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
+            Next = true;
+          }
+
+          if (Next)
+            continue;
+        }
       }
 
       // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
index 6eed92ba1d71ccc..e0754f62208a023 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
@@ -688,12 +688,12 @@ define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 2, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v2, s[0:1], 2, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], 2, v1
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 2, v1
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 2, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index cded5c94edf8cc3..d28ef2ead4bb057 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -506,47 +506,44 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    v_min_i32_e32 v10, 0, v0
+; GFX6-NEXT:    v_min_i32_e32 v9, 0, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
+; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 0x80000000, v9
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s4, v8
-; GFX6-NEXT:    v_max_i32_e32 v1, v10, v1
+; GFX6-NEXT:    v_max_i32_e32 v1, v9, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v8
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
 ; GFX6-NEXT:    v_min_i32_e32 v8, 0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s5, v8
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v5
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 0x80000000, v8
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x7fffffff, v5
 ; GFX6-NEXT:    v_max_i32_e32 v2, v8, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v5
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v2
-; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x80000000, v6
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x7fffffff, v5
 ; GFX6-NEXT:    v_max_i32_e32 v3, v6, v3
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v5
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
-; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v11, v6
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x80000000, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x7fffffff, v5
 ; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
@@ -1395,7 +1392,7 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, 0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x7fffffff, v3
 ; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_min_i32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_min_i32_e32 v4, 0, v2
@@ -1423,7 +1420,7 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT:    v_max_i32_e32 v3, 0, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, s5, v6
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 0x7fffffff, v3
 ; GFX8-NEXT:    v_max_i32_e32 v4, v6, v4
 ; GFX8-NEXT:    v_min_i32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_min_i32_e32 v4, 0, v2
@@ -1736,7 +1733,7 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v5
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x7fffffff, v5
 ; GFX6-NEXT:    v_max_i32_e32 v6, v10, v6
 ; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v2
@@ -1779,7 +1776,7 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, 0, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, s5, v10
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 0x7fffffff, v5
 ; GFX8-NEXT:    v_max_i32_e32 v6, v10, v6
 ; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX8-NEXT:    v_min_i32_e32 v6, 0, v2
@@ -1949,246 +1946,238 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v16i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_brev_b32 s4, 1
 ; GFX6-NEXT:    v_min_i32_e32 v31, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v31, vcc, s4, v31
+; GFX6-NEXT:    v_sub_i32_e32 v31, vcc, 0x80000000, v31
 ; GFX6-NEXT:    v_max_i32_e32 v16, v31, v16
-; GFX6-NEXT:    s_brev_b32 s5, -2
 ; GFX6-NEXT:    v_max_i32_e32 v31, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v31, vcc, s5, v31
+; GFX6-NEXT:    v_sub_i32_e32 v31, vcc, 0x7fffffff, v31
 ; GFX6-NEXT:    v_min_i32_e32 v16, v16, v31
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v16
 ; GFX6-NEXT:    v_min_i32_e32 v16, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, s4, v16
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 0x80000000, v16
 ; GFX6-NEXT:    v_max_i32_e32 v16, v16, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, s5, v17
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, 0x7fffffff, v17
 ; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v16
 ; GFX6-NEXT:    v_min_i32_e32 v16, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, s4, v16
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 0x80000000, v16
 ; GFX6-NEXT:    v_max_i32_e32 v17, 0, v2
 ; GFX6-NEXT:    v_max_i32_e32 v16, v16, v18
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, s5, v17
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, 0x7fffffff, v17
 ; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v16
-; GFX6-NEXT:    v_bfrev_b32_e32 v16, 1
-; GFX6-NEXT:    v_min_i32_e32 v17, 0, v3
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v17, v17, v19
-; GFX6-NEXT:    v_bfrev_b32_e32 v18, -2
-; GFX6-NEXT:    v_max_i32_e32 v19, 0, v3
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v17
-; GFX6-NEXT:    v_min_i32_e32 v17, 0, v4
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v19, 0, v4
-; GFX6-NEXT:    v_max_i32_e32 v17, v17, v20
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v17
-; GFX6-NEXT:    v_min_i32_e32 v17, 0, v5
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v19, 0, v5
-; GFX6-NEXT:    v_max_i32_e32 v17, v17, v21
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v17
-; GFX6-NEXT:    v_min_i32_e32 v17, 0, v6
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v19, 0, v6
-; GFX6-NEXT:    v_max_i32_e32 v17, v17, v22
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT:    buffer_load_dword v19, off, s[0:3], s32
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v17
-; GFX6-NEXT:    v_min_i32_e32 v17, 0, v7
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v20, 0, v7
-; GFX6-NEXT:    v_max_i32_e32 v17, v17, v23
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v18, v20
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
-; GFX6-NEXT:    v_min_i32_e32 v20, 0, v8
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
-; GFX6-NEXT:    v_max_i32_e32 v17, 0, v8
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT:    v_max_i32_e32 v20, v20, v24
-; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX6-NEXT:    v_min_i32_e32 v20, 0, v9
-; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v17
-; GFX6-NEXT:    v_max_i32_e32 v17, 0, v9
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT:    v_max_i32_e32 v20, v20, v25
-; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX6-NEXT:    v_min_i32_e32 v20, 0, v10
-; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
-; GFX6-NEXT:    v_max_i32_e32 v17, 0, v10
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT:    v_max_i32_e32 v20, v20, v26
-; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX6-NEXT:    v_min_i32_e32 v20, 0, v11
-; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
-; GFX6-NEXT:    v_max_i32_e32 v17, 0, v11
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT:    v_max_i32_e32 v20, v20, v27
-; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX6-NEXT:    v_min_i32_e32 v20, 0, v12
-; GFX6-NEXT:    v_add_i32_e32 v11, vcc, v11, v17
-; GFX6-NEXT:    v_max_i32_e32 v17, 0, v12
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT:    v_max_i32_e32 v20, v20, v28
-; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX6-NEXT:    v_min_i32_e32 v20, 0, v13
-; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
-; GFX6-NEXT:    v_max_i32_e32 v17, 0, v13
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT:    v_max_i32_e32 v20, v20, v29
-; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX6-NEXT:    v_min_i32_e32 v20, 0, v14
-; GFX6-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
-; GFX6-NEXT:    v_max_i32_e32 v17, 0, v14
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT:    v_max_i32_e32 v20, v20, v30
-; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX6-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; GFX6-NEXT:    v_max_i32_e32 v17, 0, v15
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT:    v_min_i32_e32 v18, 0, v15
-; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v16, v18
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_min_i32_e32 v16, 0, v3
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 0x80000000, v16
+; GFX6-NEXT:    v_max_i32_e32 v17, 0, v3
 ; GFX6-NEXT:    v_max_i32_e32 v16, v16, v19
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, 0x7fffffff, v17
+; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v16
+; GFX6-NEXT:    v_min_i32_e32 v16, 0, v4
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 0x80000000, v16
+; GFX6-NEXT:    v_max_i32_e32 v17, 0, v4
+; GFX6-NEXT:    v_max_i32_e32 v16, v16, v20
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, 0x7fffffff, v17
 ; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT:    buffer_load_dword v17, off, s[0:3], s32
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v16
+; GFX6-NEXT:    v_min_i32_e32 v16, 0, v5
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 0x80000000, v16
+; GFX6-NEXT:    v_max_i32_e32 v18, 0, v5
+; GFX6-NEXT:    v_max_i32_e32 v16, v16, v21
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, 0x7fffffff, v18
+; GFX6-NEXT:    v_min_i32_e32 v16, v16, v18
+; GFX6-NEXT:    v_min_i32_e32 v18, 0, v6
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v16
+; GFX6-NEXT:    v_max_i32_e32 v16, 0, v6
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, 0x80000000, v18
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 0x7fffffff, v16
+; GFX6-NEXT:    v_max_i32_e32 v18, v18, v22
+; GFX6-NEXT:    v_min_i32_e32 v16, v18, v16
+; GFX6-NEXT:    v_min_i32_e32 v18, 0, v7
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v16
+; GFX6-NEXT:    v_max_i32_e32 v16, 0, v7
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, 0x80000000, v18
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 0x7fffffff, v16
+; GFX6-NEXT:    v_max_i32_e32 v18, v18, v23
+; GFX6-NEXT:    v_min_i32_e32 v16, v18, v16
+; GFX6-NEXT:    v_min_i32_e32 v18, 0, v8
+; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v16
+; GFX6-NEXT:    v_max_i32_e32 v16, 0, v8
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, 0x80000000, v18
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 0x7fffffff, v16
+; GFX6-NEXT:    v_max_i32_e32 v18, v18, v24
+; GFX6-NEXT:    v_min_i32_e32 v16, v18, v16
+; GFX6-NEXT:    v_min_i32_e32 v18, 0, v9
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v16
+; GFX6-NEXT:    v_max_i32_e32 v16, 0, v9
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, 0x80000000, v18
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 0x7fffffff, v16
+; GFX6-NEXT:    v_max_i32_e32 v18, v18, v25
+; GFX6-NEXT:    v_min_i32_e32 v16, v18, v16
+; GFX6-NEXT:    v_min_i32_e32 v18, 0, v10
+; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v16
+; GFX6-NEXT:    v_max_i32_e32 v16, 0, v10
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, 0x80000000, v18
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 0x7fffffff, v16
+; GFX6-NEXT:    v_max_i32_e32 v18, v18, v26
+; GFX6-NEXT:    v_min_i32_e32 v16, v18, v16
+; GFX6-NEXT:    v_min_i32_e32 v18, 0, v11
+; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v16
+; GFX6-NEXT:    v_max_i32_e32 v16, 0, v11
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, 0x80000000, v18
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 0x7fffffff, v16
+; GFX6-NEXT:    v_max_i32_e32 v18, v18, v27
+; GFX6-NEXT:    v_min_i32_e32 v16, v18, v16
+; GFX6-NEXT:    v_min_i32_e32 v18, 0, v12
+; GFX6-NEXT:    v_add_i32_e32 v11, vcc, v11, v16
+; GFX6-NEXT:    v_max_i32_e32 v16, 0, v12
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, 0x80000000, v18
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 0x7fffffff, v16
+; GFX6-NEXT:    v_max_i32_e32 v18, v18, v28
+; GFX6-NEXT:    v_min_i32_e32 v16, v18, v16
+; GFX6-NEXT:    v_min_i32_e32 v18, 0, v13
+; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
+; GFX6-NEXT:    v_max_i32_e32 v16, 0, v13
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, 0x80000000, v18
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 0x7fffffff, v16
+; GFX6-NEXT:    v_max_i32_e32 v18, v18, v29
+; GFX6-NEXT:    v_min_i32_e32 v16, v18, v16
+; GFX6-NEXT:    v_min_i32_e32 v18, 0, v14
+; GFX6-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; GFX6-NEXT:    v_max_i32_e32 v16, 0, v14
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, 0x80000000, v18
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 0x7fffffff, v16
+; GFX6-NEXT:    v_max_i32_e32 v18, v18, v30
+; GFX6-NEXT:    v_min_i32_e32 v16, v18, v16
+; GFX6-NEXT:    v_min_i32_e32 v18, 0, v15
+; GFX6-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
+; GFX6-NEXT:    v_max_i32_e32 v16, 0, v15
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, 0x80000000, v18
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 0x7fffffff, v16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_max_i32_e32 v17, v18, v17
+; GFX6-NEXT:    v_min_i32_e32 v16, v17, v16
 ; GFX6-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v16i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_brev_b32 s4, 1
 ; GFX8-NEXT:    v_min_i32_e32 v31, 0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT:    v_sub_u32_e32 v31, vcc, 0x80000000, v31
 ; GFX8-NEXT:    v_max_i32_e32 v16, v31, v16
-; GFX8-NEXT:    s_brev_b32 s5, -2
 ; GFX8-NEXT:    v_max_i32_e32 v31, 0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v31, vcc, s5, v31
+; GFX8-NEXT:    v_sub_u32_e32 v31, vcc, 0x7fffffff, v31
 ; GFX8-NEXT:    v_min_i32_e32 v16, v16, v31
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v16
 ; GFX8-NEXT:    v_min_i32_e32 v16, 0, v1
-; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, 0x80000000, v16
 ; GFX8-NEXT:    v_max_i32_e32 v16, v16, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, 0, v1
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, s5, v17
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, 0x7fffffff, v17
 ; GFX8-NEXT:    v_min_i32_e32 v16, v16, v17
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v16
 ; GFX8-NEXT:    v_min_i32_e32 v16, 0, v2
-; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, 0x80000000, v16
 ; GFX8-NEXT:    v_max_i32_e32 v17, 0, v2
 ; GFX8-NEXT:    v_max_i32_e32 v16, v16, v18
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, s5, v17
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, 0x7fffffff, v17
 ; GFX8-NEXT:    v_min_i32_e32 v16, v16, v17
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v16
-; GFX8-NEXT:    v_bfrev_b32_e32 v16, 1
-; GFX8-NEXT:    v_min_i32_e32 v17, 0, v3
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT:    v_max_i32_e32 v17, v17, v19
-; GFX8-NEXT:    v_bfrev_b32_e32 v18, -2
-; GFX8-NEXT:    v_max_i32_e32 v19, 0, v3
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v17
-; GFX8-NEXT:    v_min_i32_e32 v17, 0, v4
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT:    v_max_i32_e32 v19, 0, v4
-; GFX8-NEXT:    v_max_i32_e32 v17, v17, v20
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18,...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/68028


More information about the llvm-commits mailing list