[llvm] [AMDGPU] Fold operand after shrinking instruction in SIFoldOperands (PR #68426)

via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 6 09:00:09 PDT 2023


llvmbot wrote:



@llvm/pr-subscribers-backend-amdgpu

<details>
<summary>Changes</summary>

D51345 added support for shrinking instructions in SIFoldOperands to
enable a fold, but did not actually perform the fold. Because SIFoldOperands
is run twice, in many cases the first run did the shrink, enabling the second
run to do the fold, but that only works for instructions that are created
early enough. Some shrinkable/foldable instructions are created after the
first run of SIFoldOperands, e.g. in SILoadStoreOptimizer.

This patch improves SIFoldOperands to do the fold immediately after
shrinking an instruction.
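
The core of the change is visible in the updateOperand diff below: instead of
returning right after the instruction is shrunk, the fold candidate is
retargeted at the new 32-bit instruction and the regular fold logic then runs
on it. The following is a minimal sketch of that control flow using stand-in
types rather than the real MachineInstr/MachineOperand API; ToyInstr,
ToyOperand and shrinkTo32Bit are hypothetical names, not LLVM symbols.

```cpp
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

struct ToyOperand {
  bool IsReg = true;   // true: register operand, false: immediate
  int64_t Imm = 0;
  void changeToImmediate(int64_t V) { IsReg = false; Imm = V; }
};

struct ToyInstr {
  std::string Opcode;
  std::vector<ToyOperand> Operands;
};

struct FoldCandidate {
  ToyInstr *UseMI;     // instruction whose operand we want to fold into
  unsigned UseOpNo;    // index of that operand
  int64_t ImmToFold;
  bool NeedsShrink;    // fold is only legal on the 32-bit encoding
};

// Stand-in for shrinking a 64-bit-encoded VALU op to its 32-bit form.
// (Ownership/erasure of the old instruction is elided in this sketch.)
static ToyInstr *shrinkTo32Bit(const ToyInstr &MI) {
  ToyInstr *Inst32 = new ToyInstr(MI);
  Inst32->Opcode += "_e32";
  return Inst32;
}

// Previously the shrink path returned without folding; now the fold
// candidate is retargeted at the shrunk instruction and the normal fold
// logic below runs on it.
static bool updateOperand(FoldCandidate &Fold) {
  ToyInstr *MI = Fold.UseMI;
  ToyOperand *Old = &MI->Operands[Fold.UseOpNo];
  assert(Old->IsReg);

  if (Fold.NeedsShrink) {
    ToyInstr *Inst32 = shrinkTo32Bit(*MI);
    Fold.UseMI = Inst32;
    Fold.UseOpNo = 1;  // src0 slot in this toy encoding
    MI = Fold.UseMI;
    Old = &MI->Operands[Fold.UseOpNo];
  }

  Old->changeToImmediate(Fold.ImmToFold);  // the fold happens immediately
  return true;
}

int main() {
  ToyInstr Add64{"V_ADD_CO_U32_e64",
                 {ToyOperand{}, ToyOperand{}, ToyOperand{}}};
  FoldCandidate Fold{&Add64, /*UseOpNo=*/1, /*ImmToFold=*/0x7fffffff,
                     /*NeedsShrink=*/true};
  updateOperand(Fold);
  // Fold.UseMI now points at the shrunk *_e32 instruction with the literal
  // folded into its src0 operand.
  return 0;
}
```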


---

Patch is 364.08 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68426.diff


24 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+21-17) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll (+4-3) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll (+1) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+139-147) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll (+39-39) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll (+24-26) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+201-213) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll (+20-20) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll (+515-515) 
- (modified) llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll (+15-15) 
- (modified) llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir (+10-14) 
- (modified) llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir (+9-13) 
- (modified) llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink.mir (+19-32) 
- (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+1-2) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll (+69-57) 
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+40-66) 
- (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+3-6) 
- (modified) llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll (+46-73) 
- (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+645-653) 
- (modified) llvm/test/CodeGen/AMDGPU/srem64.ll (+3-6) 
- (modified) llvm/test/CodeGen/AMDGPU/udiv64.ll (+1-2) 
- (modified) llvm/test/CodeGen/AMDGPU/urem64.ll (+1-2) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 1ebfa297f4fc339..a581acd7ea73aaf 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -198,9 +198,8 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
 
 bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
   MachineInstr *MI = Fold.UseMI;
-  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
-  assert(Old.isReg());
-
+  MachineOperand *Old = &MI->getOperand(Fold.UseOpNo);
+  assert(Old->isReg());
 
   const uint64_t TSFlags = MI->getDesc().TSFlags;
   if (Fold.isImm()) {
@@ -211,7 +210,7 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
       // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
       // already set.
       unsigned Opcode = MI->getOpcode();
-      int OpNo = MI->getOperandNo(&Old);
+      int OpNo = MI->getOperandNo(Old);
       int ModIdx = -1;
       if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
         ModIdx = AMDGPU::OpName::src0_modifiers;
@@ -236,11 +235,11 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
             if (!(Fold.ImmToFold & 0xffff)) {
               Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
               Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
-              Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+              Old->ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
               return true;
             }
             Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
-            Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
+            Old->ChangeToImmediate(Fold.ImmToFold & 0xffff);
             return true;
           }
           break;
@@ -251,7 +250,9 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
     }
   }
 
-  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
+  if (Fold.needsShrink()) {
+    assert((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && "not handled");
+
     MachineBasicBlock *MBB = MI->getParent();
     auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
     if (Liveness != MachineBasicBlock::LQR_Dead) {
@@ -290,37 +291,40 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
 
     if (Fold.Commuted)
       TII->commuteInstruction(*Inst32, false);
-    return true;
-  }
 
-  assert(!Fold.needsShrink() && "not handled");
+    Fold.UseMI = Inst32;
+    Fold.UseOpNo = AMDGPU::getNamedOperandIdx(Fold.UseMI->getOpcode(),
+                                              AMDGPU::OpName::src0);
+    MI = Fold.UseMI;
+    Old = &MI->getOperand(Fold.UseOpNo);
+  }
 
   if (Fold.isImm()) {
-    if (Old.isTied()) {
+    if (Old->isTied()) {
       int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
       if (NewMFMAOpc == -1)
         return false;
       MI->setDesc(TII->get(NewMFMAOpc));
       MI->untieRegOperand(0);
     }
-    Old.ChangeToImmediate(Fold.ImmToFold);
+    Old->ChangeToImmediate(Fold.ImmToFold);
     return true;
   }
 
   if (Fold.isGlobal()) {
-    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
-                   Fold.OpToFold->getTargetFlags());
+    Old->ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
+                    Fold.OpToFold->getTargetFlags());
     return true;
   }
 
   if (Fold.isFI()) {
-    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
+    Old->ChangeToFrameIndex(Fold.FrameIndexToFold);
     return true;
   }
 
   MachineOperand *New = Fold.OpToFold;
-  Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
-  Old.setIsUndef(New->isUndef());
+  Old->substVirtReg(New->getReg(), New->getSubReg(), *TRI);
+  Old->setIsUndef(New->isUndef());
   return true;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
index 26d1fbb09210c64..cd4b3150bd193b5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -165,9 +165,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
 ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_splat:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xffc0
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
-; GFX7-NEXT:    v_add_i32_e32 v1, vcc, s4, v1
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc0, v0
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0xffffffc0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat:
@@ -199,6 +198,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
 ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_lo:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0xffffffc0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc0, v0
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 4, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -232,6 +232,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
 ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_hi:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0xffffffc0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0xffffffc0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
index a727ed39c79c659..c8c97dd072dc228 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
@@ -101,7 +101,7 @@ define amdgpu_ps float @add_shl_vgpr_const(i32 %a, i32 %b) {
 define amdgpu_ps float @add_shl_vgpr_const_inline_const(i32 %a) {
 ; VI-LABEL: add_shl_vgpr_const_inline_const:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
+; VI-NEXT:    v_mov_b32_e32 v0, 0x7e800
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7e800, v0
 ; VI-NEXT:    ; return to shader part epilog
 ;
@@ -124,7 +124,7 @@ define amdgpu_ps float @add_shl_vgpr_const_inline_const(i32 %a) {
 define amdgpu_ps float @add_shl_vgpr_inline_const_x2(i32 %a) {
 ; VI-LABEL: add_shl_vgpr_inline_const_x2:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
+; VI-NEXT:    v_mov_b32_e32 v0, 0x600
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x600, v0
 ; VI-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 701a733d9e8e957..59314737cf0628b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -4865,6 +4865,7 @@ define i32 @v_extract_v64i32_37(ptr addrspace(1) %ptr) {
 ; MOVREL-LABEL: v_extract_v64i32_37:
 ; MOVREL:       ; %bb.0:
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MOVREL-NEXT:    v_mov_b32_e32 v0, 0x90
 ; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 0x90, v0
 ; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; MOVREL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index cded5c94edf8cc3..3699206273574d8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -231,14 +231,12 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX6-NEXT:    v_max_i32_e32 v1, v5, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -246,8 +244,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    v_min_i32_e32 v4, 0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v4
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x80000000, v4
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x7fffffff, v3
 ; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
@@ -512,15 +510,15 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s4, v8
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 0x7fffffff, v8
 ; GFX6-NEXT:    v_max_i32_e32 v1, v10, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v8
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
 ; GFX6-NEXT:    v_min_i32_e32 v8, 0, v1
+; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s5, v8
@@ -1026,7 +1024,7 @@ define i24 @v_saddsat_i24(i24 %lhs, i24 %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
 ; GFX8-NEXT:    v_bfe_i32 v0, v1, 0, 24
 ; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v0
-; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 23, v3
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0xff800000
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xff800000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -1265,19 +1263,17 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v2i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX6-NEXT:    v_max_i32_e32 v2, v5, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_min_i32_e32 v4, 0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_max_i32_e32 v2, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v4
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x80000000, v4
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
 ; GFX6-NEXT:    v_max_i32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_min_i32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
@@ -1286,19 +1282,17 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-LABEL: v_saddsat_v2i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_brev_b32 s5, 1
 ; GFX8-NEXT:    v_min_i32_e32 v5, 0, v0
-; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_max_i32_e32 v4, 0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s5, v5
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 0x80000000, v5
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX8-NEXT:    v_max_i32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX8-NEXT:    v_min_i32_e32 v4, 0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_max_i32_e32 v2, 0, v1
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s5, v4
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s4, v2
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0x80000000, v4
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 0x7fffffff, v2
 ; GFX8-NEXT:    v_max_i32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_min_i32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
@@ -1383,26 +1377,25 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v3i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v7, 0, v0
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v6, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, s5, v7
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s4, v6
+; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, 0x80000000, v7
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x7fffffff, v6
 ; GFX6-NEXT:    v_max_i32_e32 v3, v7, v3
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v6
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v1
+; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x80000000, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
 ; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_min_i32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_min_i32_e32 v4, 0, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v4
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x80000000, v4
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x7fffffff, v3
 ; GFX6-NEXT:    v_max_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_min_i32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
@@ -1411,26 +1404,25 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8-LABEL: v_saddsat_v3i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_brev_b32 s5, 1
 ; GFX8-NEXT:    v_min_i32_e32 v7, 0, v0
-; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_max_i32_e32 v6, 0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s5, v7
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, 0x80000000, v7
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 0x7fffffff, v6
 ; GFX8-NEXT:    v_max_i32_e32 v3, v7, v3
 ; GFX8-NEXT:    v_min_i32_e32 v3, v3, v6
 ; GFX8-NEXT:    v_min_i32_e32 v6, 0, v1
+; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT:    v_max_i32_e32 v3, 0, v1
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, s5, v6
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 0x80000000, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
 ; GFX8-NEXT:    v_max_i32_e32 v4, v6, v4
 ; GFX8-NEXT:    v_min_i32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_min_i32_e32 v4, 0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_max_i32_e32 v3, 0, v2
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s5, v4
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0x80000000, v4
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 0x7fffffff, v3
 ; GFX8-NEXT:    v_max_i32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_min_i32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
@@ -1536,26 +1528,24 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v4i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v9, 0, v0
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, s5, v9
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s4, v8
+; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 0x80000000, v9
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 0x7fffffff, v8
 ; GFX6-NEXT:    v_max_i32_e32 v4, v9, v4
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX6-NEXT:    v_min_i32_e32 v8, 0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s5, v8
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 0x80000000, v8
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX6-NEXT:    v_max_i32_e32 v5, v8, v5
 ; GFX6-NEXT:    v_min_i32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX6-NEXT:    v_max_i32_e32 v5, v5, v6
 ; GFX6-NEXT:    v_min_i32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v3
@@ -1571,26 +1561,24 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8-LABEL: v_saddsat_v4i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_brev_b32 s5, 1
 ; GFX8-NEXT:    v_min_i32_e32 v9, 0, v0
-; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_max_i32_e32 v8, 0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v9, vcc, s5, v9
-; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, s4, v8
+; GFX8-NEXT:    v_sub_u32_e32 v9, vcc, 0x80000000, v9
+; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 0x7fffffff, v8
 ; GFX8-NEXT:    v_max_i32_e32 v4, v9, v4
 ; GFX8-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX8-NEXT:    v_min_i32_e32 v8, 0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
 ; GFX8-NEXT:    v_max_i32_e32 v4, 0, v1
-; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, s5, v8
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 0x80000000, v8
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX8-NEXT:    v_max_i32_e32 v5, v8, v5
 ; GFX8-NEXT:    v_min_i32_e32 v4, v5, v4
 ; GFX8-NEXT:    v_min_i32_e32 v5, 0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
 ; GFX8-NEXT:    v_max_i32_e32 v4, 0, v2
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s5, v5
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 0x80000000, v5
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX8-NEXT:    v_max_i32_e32 v5, v5, v6
 ; GFX8-NEXT:    v_min_i32_e32 v4, v5, v4
 ; GFX8-NEXT:    v_min_i32_e32 v5, 0, v3
@@ -1724,34 +1712,32 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v5i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    v_min_i32_e32 v12, 0, v0
-; GFX6-NEXT:    s_brev_b32 s4, -2
+; GFX6-NEXT:    v_min_i32_e32 v11, 0, v0
 ; GFX6-NEXT:    v_max_i32_e32 v10, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, s5, v12
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s4, v10
-; GFX6-NEXT:    v_max_i32_e32 v5, v12, v5
+; GFX6-NEXT:    v_sub_i32_e32 v11, vcc, 0x80000000, v11
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, 0x7fffffff, v10
+; GFX6-NEXT:    v_max_i32_e32 v5, v11, v5
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v10
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v1
+; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, 0x80000000, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v5
 ; GFX6-NEXT:    v_max_i32_e32 v6, v10, v6
 ; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v5
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x80000000, v6
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x7fffffff, v5
 ; GFX6-NEXT:    v_max_i32_e32 v6, v6, v7
 ; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v3
-; GFX6-NEXT:    v_bfrev_b32_e32 v11, -2
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x80000000, v6
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v11, v5
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x7fffffff, v5
 ; GFX6-NEXT:    v_max_i32_e32 v6, v6, v8
 ; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v4
@@ -1767,34 +1753,32 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX8-LABEL: v_saddsat...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/68426

