[llvm] 27df165 - Revert "[amdgpu] Lower SGPR-to-VGPR copy in the final phase of ISel."

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 18 06:48:40 PDT 2020


Author: Matt Arsenault
Date: 2020-09-18T09:48:33-04:00
New Revision: 27df1652709ba83d6b07f313297e7c796e36dce1

URL: https://github.com/llvm/llvm-project/commit/27df1652709ba83d6b07f313297e7c796e36dce1
DIFF: https://github.com/llvm/llvm-project/commit/27df1652709ba83d6b07f313297e7c796e36dce1.diff

LOG: Revert "[amdgpu] Lower SGPR-to-VGPR copy in the final phase of ISel."

This reverts commit c3492a1aa1b98c8d81b0969d52cea7681f0624c2.

I think this is the wrong strategy, and the wrong place to do this
transform anyway. This also reverts the follow-up commit
7d593d0d6905b55ca1124fca5e4d1ebb17203138.
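
For context, the reverted change rewrote SGPR-to-VGPR COPYs into real move
instructions while finalizing instruction selection. A minimal MIR sketch of
the transform, as removed in the diff below (the virtual register numbers are
illustrative, not taken from the patch):

    ; Before: a generic COPY from a 32-bit SGPR into a VGPR.
    %1:vgpr_32 = COPY %0:sgpr_32
    ; After lowerSGPRToVGPRCopy: an explicit VALU move.
    %1:vgpr_32 = V_MOV_B32_e32 %0:sgpr_32, implicit $exec
    ; 64-bit copies were lowered the same way via V_MOV_B64_PSEUDO.
    %3:vreg_64 = V_MOV_B64_PSEUDO %2:sgpr_64, implicit $exec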

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/fabs.ll
    llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
    llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
    llvm/test/CodeGen/AMDGPU/wqm.ll

Removed: 
    llvm/test/CodeGen/AMDGPU/sgpr-copy-cse.ll


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 4df7fd85a5dd..a24394cdf795 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1244,11 +1244,6 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
         foldOperand(OpToFold, UseMI, OpNo, FoldList,
                     CopiesToReplace);
       } else {
-        // Skip updating the literal use if it is used in the same REG_SEQUENCE;
-        // if that literal could be inlined, it is just a single use.
-        if (NonInlineUse && NonInlineUse->getParent() == UseMI &&
-            UseMI->isRegSequence())
-          continue;
         if (++NumLiteralUses == 1) {
           NonInlineUse = &*Use;
           NonInlineUseOpNo = OpNo;

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index dde095b56fd9..577c27efc079 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -102,10 +102,6 @@ static cl::opt<bool> UseDivergentRegisterIndexing(
   cl::desc("Use indirect register addressing for divergent indexes"),
   cl::init(false));
 
-static cl::opt<bool> EnableLowerSGPRToVGPRCopy(
-    "lower-sgpr-to-vgpr-copy", cl::Hidden,
-    cl::desc("Enable lowering copy from SGPR to VGPR"), cl::init(true));
-
 static bool hasFP32Denormals(const MachineFunction &MF) {
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   return Info->getMode().allFP32Denormals();
@@ -11484,60 +11480,6 @@ bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
   return false;
 }
 
-// Lower a COPY from SGPR to VGPR into a real move instruction, as it is a
-// real data transfer rather than a plain COPY.
-static void lowerSGPRToVGPRCopy(MachineFunction &MF, MachineRegisterInfo &MRI,
-                                const SIRegisterInfo &TRI,
-                                const SIInstrInfo &TII) {
-  for (MachineBasicBlock &MBB : MF) {
-    for (auto BI = MBB.begin(), BE = MBB.end(); BI != BE; /*EMPTY*/) {
-      MachineInstr &MI = *BI++;
-
-      auto IsSGPRToVGPRCopy = [&MRI, &TRI](const MachineInstr &MI) {
-        if (!MI.isCopy())
-          return false;
-
-        auto DstReg = MI.getOperand(0).getReg();
-        auto SrcReg = MI.getOperand(1).getReg();
-        const auto *DstRC = DstReg.isVirtual() ? MRI.getRegClass(DstReg)
-                                               : TRI.getPhysRegClass(DstReg);
-        const auto *SrcRC = SrcReg.isVirtual() ? MRI.getRegClass(SrcReg)
-                                               : TRI.getPhysRegClass(SrcReg);
-        return (DstRC == &AMDGPU::VGPR_32RegClass ||
-                DstRC == &AMDGPU::VReg_64RegClass) &&
-               (SrcRC == &AMDGPU::SGPR_32RegClass ||
-                SrcRC == &AMDGPU::SGPR_64RegClass);
-      };
-
-      // Skip if it's not a copy from SGPR to VGPR.
-      if (!IsSGPRToVGPRCopy(MI))
-        continue;
-
-      const MachineOperand &Src = MI.getOperand(1);
-      // FIXME: Need subreg support.
-      if (Src.getSubReg() != AMDGPU::NoSubRegister)
-        continue;
-      // FIXME: Need undef support.
-      if (Src.getReg().isVirtual()) {
-        auto *DefMI = MRI.getVRegDef(Src.getReg());
-        if (!DefMI || DefMI->isImplicitDef())
-          continue;
-      }
-
-      LLVM_DEBUG(dbgs() << "Lower COPY: " << MI);
-      unsigned Opcode = (TRI.getRegSizeInBits(Src.getReg(), MRI) == 64)
-                            ? AMDGPU::V_MOV_B64_PSEUDO
-                            : AMDGPU::V_MOV_B32_e32;
-      auto DstReg = MI.getOperand(0).getReg();
-      auto MIB = BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(Opcode), DstReg)
-                     .add(MI.getOperand(1));
-      (void)MIB;
-      LLVM_DEBUG(dbgs() << "        to: " << *MIB.getInstr());
-      MI.eraseFromParent();
-    }
-  }
-}
-
 // Figure out which registers should be reserved for stack access. Only after
 // the function is legalized do we know all of the non-spill stack objects or if
 // calls are present.
@@ -11546,10 +11488,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
-  const SIInstrInfo *TII = Subtarget->getInstrInfo();
-
-  if (EnableLowerSGPRToVGPRCopy)
-    lowerSGPRToVGPRCopy(MF, MRI, *TRI, *TII);
 
   if (Info->isEntryFunction()) {
     // Callable functions have fixed registers used for stack access.

diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index 05f0bafb47c7..badaa16bbfcc 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -11,7 +11,7 @@
 ; R600-NOT: AND
 ; R600: |PV.{{[XYZW]}}|
 
-; SI: s_bitset0_b32 s{{[0-9]+}}, 31
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
 ; VI: s_bitset0_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
@@ -24,7 +24,7 @@ define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
 ; R600-NOT: AND
 ; R600: |PV.{{[XYZW]}}|
 
-; SI: s_bitset0_b32 s{{[0-9]+}}, 31
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
 ; VI: s_bitset0_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
@@ -36,7 +36,7 @@ define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
 ; FUNC-LABEL: {{^}}s_fabs_f32:
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 
-; SI: s_bitset0_b32 s{{[0-9]+}}, 31
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
 ; VI: s_bitset0_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) {
   %fabs = call float @llvm.fabs.f32(float %in)

diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index afae6b43ee58..a621b04a346c 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -34,7 +34,7 @@ define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x
 ; R600: |PV.{{[XYZW]}}|
 ; R600: -PV
 
-; SI: s_bitset1_b32 s{{[0-9]+}}, 31
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
 ; VI: s_bitset1_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
@@ -49,7 +49,7 @@ define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in)
 ; R600: |PV.{{[XYZW]}}|
 ; R600: -PV
 
-; SI: s_bitset1_b32 s{{[0-9]+}}, 31
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
 define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
   %fabs = call float @fabs(float %bc)
@@ -59,7 +59,7 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %
 }
 
 ; FUNC-LABEL: {{^}}fneg_fabs_f32:
-; SI: s_bitset1_b32 s{{[0-9]+}}, 31
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
 define amdgpu_kernel void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
   %fabs = call float @llvm.fabs.f32(float %in)
   %fsub = fsub float -0.000000e+00, %fabs

diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy-cse.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy-cse.ll
deleted file mode 100644
index f032f170e3b4..000000000000
--- a/llvm/test/CodeGen/AMDGPU/sgpr-copy-cse.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -o - %s | FileCheck %s
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
-target triple = "amdgcn-amd-amdhsa"
-
-; CHECK-LABEL: {{^}}t0:
-; CHECK: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[4:5], 0x0
-; CHECK-COUNT-1: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]]
-; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]]
-define protected amdgpu_kernel void @t0(float addrspace(1)* %p, i32 %i0, i32 %j0, i32 %k0) {
-entry:
-  %0 = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %i = add i32 %0, %i0
-  %j = add i32 %0, %j0
-  %k = add i32 %0, %k0
-  %pi = getelementptr float, float addrspace(1)* %p, i32 %i
-  %vi = load float, float addrspace(1)* %pi
-  %pj = getelementptr float, float addrspace(1)* %p, i32 %j
-  %vj = load float, float addrspace(1)* %pj
-  %sum = fadd float %vi, %vj
-  %pk = getelementptr float, float addrspace(1)* %p, i32 %k
-  store float %sum, float addrspace(1)* %pk
-  ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x()

diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
index 4d9c6a9a540f..4cbd89147722 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -153,9 +153,7 @@ bb:
 
 ; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
 ; GCN:        flat_load_dword
-; GFX8_9:     s_waitcnt lgkmcnt(0){{$}}
-; GFX8_9:     s_waitcnt vmcnt(0){{$}}
-; GFX10:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX10:      s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:   s_barrier
 define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {

diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 860e58d33abf..127d0bc0fc68 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -650,12 +650,12 @@ main_body:
 ; CHECK: image_store
 ; CHECK: s_wqm_b64 exec, exec
 ; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
-; CHECK-DAG: v_mov_b32_e32 [[SEVEN:v[0-9]+]], 0x40e00000
+; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
 
 ; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
 ; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
 ; CHECK: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
-; CHECK: v_cmp_gt_f32_e32 vcc, [[CTR]], [[SEVEN]]
+; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
 ; CHECK: s_cbranch_vccz [[LOOPHDR]]
 
 ; CHECK: ; %break


        

