[llvm] f2284e3 - [Sink] Optimize/simplify sink candidate finding with nearest common dominator

Fangrui Song via llvm-commits llvm-commits at lists.llvm.org
Sun Aug 30 22:52:04 PDT 2020


Author: Fangrui Song
Date: 2020-08-30T22:51:00-07:00
New Revision: f2284e3405d87143b2478f28b8045bef84953c91

URL: https://github.com/llvm/llvm-project/commit/f2284e3405d87143b2478f28b8045bef84953c91
DIFF: https://github.com/llvm/llvm-project/commit/f2284e3405d87143b2478f28b8045bef84953c91.diff

LOG: [Sink] Optimize/simplify sink candidate finding with nearest common dominator

For an instruction in a basic block BB, SinkingPass enumerates the basic blocks
dominated by BB as well as BB's successors. For each enumerated basic block,
SinkingPass calls `AllUsesDominatedByBlock` to check whether that block
dominates all of the instruction's users. This is inefficient.

Instead, compute the nearest common dominator of all users to obtain the
candidate directly, avoiding the enumeration. The nearest common dominator may
lie in a parent loop, where sinking is not beneficial. In that case, walk up
its ancestors in the dominator tree until an acceptable block is found.

If the instruction has no users, this change avoids performing an unnecessary
move. This accounts for some of the AMDGPU test changes.

A stage-2 x86-64 clang build is byte identical with this change.
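
In rough terms, the new candidate search looks like the following sketch
(condensed from the Sink.cpp hunk below; `IsAcceptableTarget` is the existing
static helper in that file, everything else is standard LLVM API — this is an
illustration, not the literal patch):

```cpp
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Existing helper in Sink.cpp (declared here only so the sketch is complete).
static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
                               DominatorTree &DT, LoopInfo &LI);

static BasicBlock *findSinkCandidate(Instruction *Inst, DominatorTree &DT,
                                     LoopInfo &LI) {
  BasicBlock *BB = Inst->getParent();
  BasicBlock *SuccToSinkTo = nullptr;

  // Step 1: the nearest common dominator of all use blocks is the candidate.
  // A PHI use counts as occurring in the corresponding incoming block.
  for (Use &U : Inst->uses()) {
    Instruction *UseInst = cast<Instruction>(U.getUser());
    BasicBlock *UseBlock = UseInst->getParent();
    if (PHINode *PN = dyn_cast<PHINode>(UseInst))
      UseBlock = PN->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    SuccToSinkTo = SuccToSinkTo
                       ? DT.findNearestCommonDominator(SuccToSinkTo, UseBlock)
                       : UseBlock;
    // BB must dominate the candidate, otherwise sinking is impossible.
    if (!DT.dominates(BB, SuccToSinkTo))
      return nullptr;
  }

  // Step 2: if the candidate is not acceptable (e.g. it sits in a parent
  // loop), walk up the dominator tree towards BB.
  while (SuccToSinkTo && SuccToSinkTo != BB &&
         !IsAcceptableTarget(Inst, SuccToSinkTo, DT, LI))
    SuccToSinkTo = DT.getNode(SuccToSinkTo)->getIDom()->getBlock();

  // No users, or only BB itself left: nothing to sink to.
  return SuccToSinkTo == BB ? nullptr : SuccToSinkTo;
}
```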

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/Sink.cpp
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/setcc.ll
    llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
    llvm/test/CodeGen/AMDGPU/srem64.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp
index 48f289c8f17d..c430724ab1ac 100644
--- a/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -32,31 +32,6 @@ using namespace llvm;
 STATISTIC(NumSunk, "Number of instructions sunk");
 STATISTIC(NumSinkIter, "Number of sinking iterations");
 
-/// AllUsesDominatedByBlock - Return true if all uses of the specified value
-/// occur in blocks dominated by the specified block.
-static bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB,
-                                    DominatorTree &DT) {
-  // Ignoring debug uses is necessary so debug info doesn't affect the code.
-  // This may leave a referencing dbg_value in the original block, before
-  // the definition of the vreg.  Dwarf generator handles this although the
-  // user might not get the right info at runtime.
-  for (Use &U : Inst->uses()) {
-    // Determine the block of the use.
-    Instruction *UseInst = cast<Instruction>(U.getUser());
-    BasicBlock *UseBlock = UseInst->getParent();
-    if (PHINode *PN = dyn_cast<PHINode>(UseInst)) {
-      // PHI nodes use the operand in the predecessor block, not the block with
-      // the PHI.
-      unsigned Num = PHINode::getIncomingValueNumForOperand(U.getOperandNo());
-      UseBlock = PN->getIncomingBlock(Num);
-    }
-    // Check that it dominates.
-    if (!DT.dominates(BB, UseBlock))
-      return false;
-  }
-  return true;
-}
-
 static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
                          SmallPtrSetImpl<Instruction *> &Stores) {
 
@@ -97,11 +72,6 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
   assert(Inst && "Instruction to be sunk is null");
   assert(SuccToSinkTo && "Candidate sink target is null");
 
-  // It is not possible to sink an instruction into its own block.  This can
-  // happen with loops.
-  if (Inst->getParent() == SuccToSinkTo)
-    return false;
-
   // It's never legal to sink an instruction into a block which terminates in an
   // EH-pad.
   if (SuccToSinkTo->getTerminator()->isExceptionalTerminator())
@@ -129,9 +99,7 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
       return false;
   }
 
-  // Finally, check that all the uses of the instruction are actually
-  // dominated by the candidate
-  return AllUsesDominatedByBlock(Inst, SuccToSinkTo, DT);
+  return true;
 }
 
 /// SinkInstruction - Determine whether it is safe to sink the specified machine
@@ -162,25 +130,34 @@ static bool SinkInstruction(Instruction *Inst,
   // decide.
   BasicBlock *SuccToSinkTo = nullptr;
 
-  // Instructions can only be sunk if all their uses are in blocks
-  // dominated by one of the successors.
-  // Look at all the dominated blocks and see if we can sink it in one.
-  DomTreeNode *DTN = DT.getNode(Inst->getParent());
-  for (auto I = DTN->begin(), E = DTN->end(); I != E && SuccToSinkTo == nullptr;
-       ++I) {
-    BasicBlock *Candidate = (*I)->getBlock();
-    // A node always immediate-dominates its children on the dominator
-    // tree.
-    if (IsAcceptableTarget(Inst, Candidate, DT, LI))
-      SuccToSinkTo = Candidate;
+  // Find the nearest common dominator of all users as the candidate.
+  BasicBlock *BB = Inst->getParent();
+  for (Use &U : Inst->uses()) {
+    Instruction *UseInst = cast<Instruction>(U.getUser());
+    BasicBlock *UseBlock = UseInst->getParent();
+    if (PHINode *PN = dyn_cast<PHINode>(UseInst)) {
+      // PHI nodes use the operand in the predecessor block, not the block with
+      // the PHI.
+      unsigned Num = PHINode::getIncomingValueNumForOperand(U.getOperandNo());
+      UseBlock = PN->getIncomingBlock(Num);
+    }
+    if (SuccToSinkTo)
+      SuccToSinkTo = DT.findNearestCommonDominator(SuccToSinkTo, UseBlock);
+    else
+      SuccToSinkTo = UseBlock;
+    // The current basic block needs to dominate the candidate.
+    if (!DT.dominates(BB, SuccToSinkTo))
+      return false;
   }
 
-  // If no suitable postdominator was found, look at all the successors and
-  // decide which one we should sink to, if any.
-  for (succ_iterator I = succ_begin(Inst->getParent()),
-      E = succ_end(Inst->getParent()); I != E && !SuccToSinkTo; ++I) {
-    if (IsAcceptableTarget(Inst, *I, DT, LI))
-      SuccToSinkTo = *I;
+  if (SuccToSinkTo) {
+    // The nearest common dominator may be in a parent loop of BB, which may not
+    // be beneficial. Find an ancestor.
+    while (SuccToSinkTo != BB &&
+           !IsAcceptableTarget(Inst, SuccToSinkTo, DT, LI))
+      SuccToSinkTo = DT.getNode(SuccToSinkTo)->getIDom()->getBlock();
+    if (SuccToSinkTo == BB)
+      SuccToSinkTo = nullptr;
   }
 
   // If we couldn't find a block to sink to, ignore this instruction.

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 1d235d1db5cd..091959adcd71 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -240,15 +240,15 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
-; GCN-IR-NEXT:  BB0_7: ; %Flow7
+; GCN-IR-NEXT:  BB0_7: ; %udiv-end
 ; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[8:9], s[2:3]
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = sdiv i64 %x, %y
@@ -411,26 +411,26 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v10
 ; GCN-IR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v14, v7, v0, vcc
-; GCN-IR-NEXT:    v_sub_i32_e32 v11, vcc, v13, v14
-; GCN-IR-NEXT:    v_subb_u32_e64 v12, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[11:12]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[4:5], 63, v[11:12]
+; GCN-IR-NEXT:    v_sub_i32_e32 v7, vcc, v13, v14
+; GCN-IR-NEXT:    v_subb_u32_e64 v8, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[7:8]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[4:5], 63, v[7:8]
 ; GCN-IR-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[6:7], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v18, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v5
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v7, v10, 0, s[6:7]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v12, v10, 0, s[6:7]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[8:9], s[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v15, v18
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v9, 0, s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB1_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, 1, v11
-; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, 0, v12, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v0, s[4:5], 63, v11
-; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[16:17], v[11:12]
+; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, 1, v7
+; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, 0, v8, vcc
+; GCN-IR-NEXT:    v_sub_i32_e64 v0, s[4:5], 63, v7
+; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[16:17], v[7:8]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[9:10], v0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
@@ -480,14 +480,14 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:  BB1_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[7:8], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v7, v12, v3
+; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v3
 ; GCN-IR-NEXT:    v_or_b32_e32 v0, v11, v2
 ; GCN-IR-NEXT:  BB1_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT:    v_xor_b32_e32 v2, v5, v4
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GCN-IR-NEXT:    v_xor_b32_e32 v3, v7, v1
+; GCN-IR-NEXT:    v_xor_b32_e32 v3, v12, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
@@ -1111,7 +1111,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
-; GCN-IR-NEXT:  BB9_7: ; %Flow4
+; GCN-IR-NEXT:  BB9_7: ; %udiv-end
 ; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[6:7], s[2:3]
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
@@ -1341,9 +1341,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, s3, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = sdiv i64 24, %x

diff  --git a/llvm/test/CodeGen/AMDGPU/setcc.ll b/llvm/test/CodeGen/AMDGPU/setcc.ll
index 91fec72cab51..a259784bc278 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc.ll
@@ -397,9 +397,9 @@ endif:
 }
 
 ; FUNC-LABEL: setcc-i1-and-xor
-; GCN-DAG: v_cmp_ge_f32_e64 [[A:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_cmp_le_f32_e64 [[B:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
-; GCN: s_and_b64 s[2:3], [[A]], [[B]]
+; GCN-DAG: v_cmp_nge_f32_e64 [[A:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
+; GCN-DAG: v_cmp_nle_f32_e64 [[B:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
+; GCN: s_or_b64 s[2:3], [[A]], [[B]]
 define amdgpu_kernel void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 {
 bb0:
   %tmp5 = fcmp oge float %cond, 0.000000e+00

diff  --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index 7387e98ae864..06f09e8e4d07 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -221,11 +221,8 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
 ; SI-NEXT:    s_and_b64 vcc, exec, s[14:15]
 ; SI-NEXT:    s_cbranch_vccz BB3_13
 ; SI-NEXT:  ; %bb.10: ; %for.cond.preheader
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, 0x3e8
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, s8, v0
-; SI-NEXT:    s_and_b64 vcc, exec, vcc
-; SI-NEXT:    s_cbranch_vccz BB3_13
+; SI-NEXT:    s_cmpk_lt_i32 s8, 0x3e8
+; SI-NEXT:    s_cbranch_scc0 BB3_13
 ; SI-NEXT:  ; %bb.11: ; %for.body
 ; SI-NEXT:    s_and_b64 vcc, exec, 0
 ; SI-NEXT:  BB3_12: ; %self.loop
@@ -295,10 +292,8 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
 ; FLAT-NEXT:    s_and_b64 vcc, exec, s[14:15]
 ; FLAT-NEXT:    s_cbranch_vccz BB3_13
 ; FLAT-NEXT:  ; %bb.10: ; %for.cond.preheader
-; FLAT-NEXT:    v_mov_b32_e32 v0, 0x3e8
-; FLAT-NEXT:    v_cmp_lt_i32_e32 vcc, s8, v0
-; FLAT-NEXT:    s_and_b64 vcc, exec, vcc
-; FLAT-NEXT:    s_cbranch_vccz BB3_13
+; FLAT-NEXT:    s_cmpk_lt_i32 s8, 0x3e8
+; FLAT-NEXT:    s_cbranch_scc0 BB3_13
 ; FLAT-NEXT:  ; %bb.11: ; %for.body
 ; FLAT-NEXT:    s_and_b64 vcc, exec, 0
 ; FLAT-NEXT:  BB3_12: ; %self.loop

diff  --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 1f961fcf1a3b..cd0b7f77af43 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -218,14 +218,16 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_mul_hi_u32 v2, s2, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, s3, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, s2, v0
+; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GCN-IR-NEXT:    s_mov_b32 s10, -1
+; GCN-IR-NEXT:    s_mov_b32 s8, s4
+; GCN-IR-NEXT:    s_mov_b32 s9, s5
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = urem i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
@@ -1026,56 +1028,56 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_ashr_i32 s2, s7, 31
 ; GCN-IR-NEXT:    s_ashr_i64 s[10:11], s[0:1], 31
 ; GCN-IR-NEXT:    s_ashr_i32 s0, s1, 31
+; GCN-IR-NEXT:    s_mov_b32 s1, s0
 ; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[6:7], 31
 ; GCN-IR-NEXT:    s_mov_b32 s3, s2
-; GCN-IR-NEXT:    s_mov_b32 s1, s0
 ; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[2:3]
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[10:11], s[0:1]
-; GCN-IR-NEXT:    s_sub_u32 s6, s6, s2
-; GCN-IR-NEXT:    s_subb_u32 s7, s7, s2
-; GCN-IR-NEXT:    s_sub_u32 s8, s8, s0
-; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s8
-; GCN-IR-NEXT:    s_subb_u32 s9, s9, s0
+; GCN-IR-NEXT:    s_xor_b64 s[10:11], s[10:11], s[0:1]
+; GCN-IR-NEXT:    s_sub_u32 s8, s6, s2
+; GCN-IR-NEXT:    s_subb_u32 s9, s7, s2
+; GCN-IR-NEXT:    s_sub_u32 s10, s10, s0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s10
+; GCN-IR-NEXT:    s_subb_u32 s11, s11, s0
 ; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s9
+; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s11
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s12
-; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s6
+; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s8
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s13
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s9, 0
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s11, 0
 ; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s7
+; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s9
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s13
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s12
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s9, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
 ; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[12:13], 0, 0, vcc
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[8:9], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[10:11], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
-; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
 ; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
-; GCN-IR-NEXT:    s_xor_b64 s[10:11], s[0:1], -1
-; GCN-IR-NEXT:    s_and_b64 s[10:11], s[10:11], vcc
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[10:11]
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[6:7], vcc
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB8_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[4:5], v[0:1]
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 63, v0
-; GCN-IR-NEXT:    v_lshl_b64 v[0:1], s[6:7], v0
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], s[8:9], v0
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB8_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_not_b32_e32 v2, v2
-; GCN-IR-NEXT:    s_add_u32 s10, s8, -1
-; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[6:7], v4
+; GCN-IR-NEXT:    s_add_u32 s6, s10, -1
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[8:9], v4
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, v2, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    s_addc_u32 s11, s9, -1
+; GCN-IR-NEXT:    s_addc_u32 s7, s11, -1
 ; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[0:1], -1, 0, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
@@ -1086,13 +1088,13 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
 ; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s10, v6
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s6, v6
 ; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
-; GCN-IR-NEXT:    v_and_b32_e32 v10, s8, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v10, s10, v8
 ; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
-; GCN-IR-NEXT:    v_and_b32_e32 v11, s9, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v11, s11, v8
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
 ; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
@@ -1107,9 +1109,9 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_cbranch_vccz BB8_3
 ; GCN-IR-NEXT:    s_branch BB8_6
 ; GCN-IR-NEXT:  BB8_4:
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s9
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
 ; GCN-IR-NEXT:    s_branch BB8_7
 ; GCN-IR-NEXT:  BB8_5:
@@ -1120,22 +1122,22 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GCN-IR-NEXT:  BB8_7: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, s8, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, s8, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, s9, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, s8, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v1, s10, v1
+; GCN-IR-NEXT:    v_mul_hi_u32 v2, s10, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v3, s11, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, s10, v0
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, s3, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 31
@@ -1525,8 +1527,8 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = srem i64 24, %x

diff  --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 53af9618271d..a0eba73e7d0f 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -218,14 +218,16 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_mul_hi_u32 v2, s2, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, s3, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, s2, v0
+; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GCN-IR-NEXT:    s_mov_b32 s10, -1
+; GCN-IR-NEXT:    s_mov_b32 s8, s4
+; GCN-IR-NEXT:    s_mov_b32 s9, s5
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = urem i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
@@ -938,13 +940,15 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    v_mul_hi_u32 v2, s6, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, s7, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, s6, v0
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
+; GCN-IR-NEXT:    s_mov_b32 s2, -1
+; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = urem i64 24, %x
   store i64 %result, i64 addrspace(1)* %out
@@ -1136,13 +1140,15 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    v_mul_hi_u32 v2, v0, 24
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = urem i64 %x, 24
   store i64 %result, i64 addrspace(1)* %out


        

