[llvm-branch-commits] [llvm] a80ebd0 - [AMDGPU] Fix llvm.amdgcn.init.exec and frame materialization
Carl Ritson via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sun Jan 24 15:55:14 PST 2021
Author: Carl Ritson
Date: 2021-01-25T08:31:17+09:00
New Revision: a80ebd01798ca82a4f5ffd6d355c5c9facd83375
URL: https://github.com/llvm/llvm-project/commit/a80ebd01798ca82a4f5ffd6d355c5c9facd83375
DIFF: https://github.com/llvm/llvm-project/commit/a80ebd01798ca82a4f5ffd6d355c5c9facd83375.diff
LOG: [AMDGPU] Fix llvm.amdgcn.init.exec and frame materialization
Frame-base materialization may insert vector instructions before EXEC is initialised.
Fix this by moving the lowering of llvm.amdgcn.init.exec later in the backend.
Also remove the SI_INIT_EXEC_LO pseudo, as it is no longer necessary.
Reviewed By: ruiling
Differential Revision: https://reviews.llvm.org/D94645
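The failing pattern, reduced to a minimal IR sketch (function name hypothetical; it assumes a frame large enough that the base offset must be materialized in a VGPR):

define amdgpu_ps float @init_exec_hazard(i32 inreg %a) {
main_body:
  ; The large alloca triggers frame-base materialization, which previously
  ; emitted a v_mov_b32 ahead of the s_mov_b64 that initialises EXEC.
  %array = alloca [1024 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec(i64 -1)
  %ptr = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr, align 4
  ret float 0.0
}
declare void @llvm.amdgcn.init.exec(i64)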
Added:
Modified:
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 2cab7f38e281..5b8b563df40a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -182,6 +182,8 @@ def int_amdgcn_init_exec : Intrinsic<[],
// Set EXEC according to a thread count packed in an SGPR input:
// thread_count = (input >> bitoffset) & 0x7f;
// This is always moved to the beginning of the basic block.
+// Note: only inreg arguments to the parent function are valid as
+// inputs to this intrinsic; computed values cannot be used.
def int_amdgcn_init_exec_from_input : Intrinsic<[],
[llvm_i32_ty, // 32-bit SGPR input
llvm_i32_ty], // bit offset of the thread count
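To make the new note concrete, a hedged pair of usage sketches (function names hypothetical): the first passes an inreg argument of the parent function directly and is valid; the second feeds a computed value into the intrinsic and is not supported.

; OK: the input is an inreg argument of the parent function.
define amdgpu_ps void @valid_init(i32 inreg %count) {
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
  ret void
}

; Not supported: the input is a computed value, not a plain inreg argument.
define amdgpu_ps void @invalid_init(i32 inreg %count) {
  %c = add i32 %count, 1
  call void @llvm.amdgcn.init.exec.from.input(i32 %c, i32 8)
  ret void
}

declare void @llvm.amdgcn.init.exec.from.input(i32, i32)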
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e959c5f0f8d3..839437b5e3f8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4021,77 +4021,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
- case AMDGPU::SI_INIT_EXEC:
- // This should be before all vector instructions.
- BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
- AMDGPU::EXEC)
- .addImm(MI.getOperand(0).getImm());
- MI.eraseFromParent();
- return BB;
-
- case AMDGPU::SI_INIT_EXEC_LO:
- // This should be before all vector instructions.
- BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
- AMDGPU::EXEC_LO)
- .addImm(MI.getOperand(0).getImm());
- MI.eraseFromParent();
- return BB;
-
- case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
- // Extract the thread count from an SGPR input and set EXEC accordingly.
- // Since BFM can't shift by 64, handle that case with CMP + CMOV.
- //
- // S_BFE_U32 count, input, {shift, 7}
- // S_BFM_B64 exec, count, 0
- // S_CMP_EQ_U32 count, 64
- // S_CMOV_B64 exec, -1
- MachineInstr *FirstMI = &*BB->begin();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- Register InputReg = MI.getOperand(0).getReg();
- Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- bool Found = false;
-
- // Move the COPY of the input reg to the beginning, so that we can use it.
- for (auto I = BB->begin(); I != &MI; I++) {
- if (I->getOpcode() != TargetOpcode::COPY ||
- I->getOperand(0).getReg() != InputReg)
- continue;
-
- if (I == FirstMI) {
- FirstMI = &*++BB->begin();
- } else {
- I->removeFromParent();
- BB->insert(FirstMI, &*I);
- }
- Found = true;
- break;
- }
- assert(Found);
- (void)Found;
-
- // This should be before all vector instructions.
- unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1;
- bool isWave32 = getSubtarget()->isWave32();
- unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
- .addReg(InputReg)
- .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
- BuildMI(*BB, FirstMI, DebugLoc(),
- TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
- Exec)
- .addReg(CountReg)
- .addImm(0);
- BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
- .addReg(CountReg, RegState::Kill)
- .addImm(getSubtarget()->getWavefrontSize());
- BuildMI(*BB, FirstMI, DebugLoc(),
- TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
- Exec)
- .addImm(-1);
- MI.eraseFromParent();
- return BB;
- }
-
case AMDGPU::GET_GROUPSTATICSIZE: {
assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 732e9d76127e..7c1cbd67c993 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -399,32 +399,13 @@ def SI_INIT_EXEC : SPseudoInstSI <
(outs), (ins i64imm:$src),
[(int_amdgcn_init_exec (i64 timm:$src))]> {
let Defs = [EXEC];
- let usesCustomInserter = 1;
- let isAsCheapAsAMove = 1;
- let WaveSizePredicate = isWave64;
-}
-
-// FIXME: Intrinsic should be mangled for wave size.
-def SI_INIT_EXEC_LO : SPseudoInstSI <
- (outs), (ins i32imm:$src), []> {
- let Defs = [EXEC_LO];
- let usesCustomInserter = 1;
let isAsCheapAsAMove = 1;
- let WaveSizePredicate = isWave32;
}
-// FIXME: Wave32 version
def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
(outs), (ins SSrc_b32:$input, i32imm:$shift),
[(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
let Defs = [EXEC];
- let usesCustomInserter = 1;
-}
-
-def : GCNPat <
- (int_amdgcn_init_exec timm:$src),
- (SI_INIT_EXEC_LO (as_i32timm timm:$src))> {
- let WaveSizePredicate = isWave32;
}
// Return for returning shaders to a shader variant epilog.
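With the wave-size predicates dropped above, a single SI_INIT_EXEC pseudo now serves both wave sizes; the choice between S_MOV_B32/EXEC_LO and S_MOV_B64/EXEC is deferred to the pseudo's expansion in SILowerControlFlow::lowerInitExec below.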
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 9ba5f8f8e857..5839e59b4d7f 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -93,6 +93,8 @@ class SILowerControlFlow : public MachineFunctionPass {
MachineBasicBlock *emitEndCf(MachineInstr &MI);
+ void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);
+
void findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const;
@@ -661,6 +663,90 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
return SplitBB;
}
+void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
+ MachineInstr &MI) {
+ MachineFunction &MF = *MBB->getParent();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ bool IsWave32 = ST.isWave32();
+
+ if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
+ // This should be before all vector instructions.
+ BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+ TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
+ .addImm(MI.getOperand(0).getImm());
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+ return;
+ }
+
+ // Extract the thread count from an SGPR input and set EXEC accordingly.
+ // Since BFM can't shift by 64, handle that case with CMP + CMOV.
+ //
+ // S_BFE_U32 count, input, {shift, 7}
+ // S_BFM_B64 exec, count, 0
+ // S_CMP_EQ_U32 count, 64
+ // S_CMOV_B64 exec, -1
+ Register InputReg = MI.getOperand(0).getReg();
+ MachineInstr *FirstMI = &*MBB->begin();
+ if (InputReg.isVirtual()) {
+ MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
+ assert(DefInstr && DefInstr->isCopy());
+ if (DefInstr->getParent() == MBB) {
+ if (DefInstr != FirstMI) {
+ // If the `InputReg` is defined in the current block, we also need to
+ // move that instruction to the beginning of the block.
+ DefInstr->removeFromParent();
+ MBB->insert(FirstMI, DefInstr);
+ if (LIS)
+ LIS->handleMove(*DefInstr);
+ } else {
+ // If the first instruction is the definition, move the insertion point after it.
+ FirstMI = &*std::next(FirstMI->getIterator());
+ }
+ }
+ }
+
+ // Insert instruction sequence at block beginning (before vector operations).
+ const DebugLoc DL = MI.getDebugLoc();
+ const unsigned WavefrontSize = ST.getWavefrontSize();
+ const unsigned Mask = (WavefrontSize << 1) - 1;
+ Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
+ .addReg(InputReg)
+ .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
+ auto BfmMI =
+ BuildMI(*MBB, FirstMI, DL,
+ TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
+ .addReg(CountReg)
+ .addImm(0);
+ auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
+ .addReg(CountReg, RegState::Kill)
+ .addImm(WavefrontSize);
+ auto CmovMI =
+ BuildMI(*MBB, FirstMI, DL,
+ TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
+ Exec)
+ .addImm(-1);
+
+ if (!LIS) {
+ MI.eraseFromParent();
+ return;
+ }
+
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+
+ LIS->InsertMachineInstrInMaps(*BfeMI);
+ LIS->InsertMachineInstrInMaps(*BfmMI);
+ LIS->InsertMachineInstrInMaps(*CmpMI);
+ LIS->InsertMachineInstrInMaps(*CmovMI);
+
+ LIS->removeInterval(InputReg);
+ LIS->createAndComputeVirtRegInterval(InputReg);
+ LIS->createAndComputeVirtRegInterval(CountReg);
+}
+
bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
auto GetFallThroughSucc = [=](MachineBasicBlock *B) -> MachineBasicBlock * {
auto *S = B->getNextNode();
@@ -781,6 +867,14 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
SplitMBB = process(MI);
break;
+ // FIXME: find a better place for this
+ case AMDGPU::SI_INIT_EXEC:
+ case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
+ lowerInitExec(MBB, MI);
+ if (LIS)
+ LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
+ break;
+
default:
break;
}
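A concrete reading of the emitted sequence: with shift 8 and bits [14:8] of the input holding 5, S_BFE_U32 extracts count = 5, S_BFM_B64 sets exec to (1 << 5) - 1 = 0x1f, the compare against the wavefront size fails, and the S_CMOV is a no-op. With count = 64, BFM cannot produce a 64-bit-wide mask, so S_CMP_EQ_U32 sets SCC and S_CMOV_B64 overwrites exec with -1.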
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
index f3998d9a899d..27adb5c3d026 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
@@ -84,6 +84,117 @@ main_body:
unreachable
}
+; GCN-LABEL: {{^}}init_exec_before_frame_materialize:
+; GCN-NOT: {{^}}v_
+; GCN: s_mov_b64 exec, -1
+; GCN: v_mov
+; GCN: v_add
+define amdgpu_ps float @init_exec_before_frame_materialize(i32 inreg %a, i32 inreg %b) {
+main_body:
+ %array0 = alloca [1024 x i32], align 16, addrspace(5)
+ %array1 = alloca [20 x i32], align 16, addrspace(5)
+ call void @llvm.amdgcn.init.exec(i64 -1)
+
+ %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+ store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+ %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+ store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+ %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+ store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+ %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+ %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+ %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+ %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+ %v5 = add i32 %v3, %v4
+ %v = bitcast i32 %v5 to float
+ ret float %v
+}
+
+; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize:
+; GCN-NOT: {{^}}v_
+; GCN: s_bfe_u32 s2, s2, 0x70008
+; GCN-NEXT: s_bfm_b64 exec, s2, 0
+; GCN-NEXT: s_cmp_eq_u32 s2, 64
+; GCN-NEXT: s_cmov_b64 exec, -1
+; GCN: v_mov
+; GCN: v_add
+define amdgpu_ps float @init_exec_input_before_frame_materialize(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
+main_body:
+ %array0 = alloca [1024 x i32], align 16, addrspace(5)
+ %array1 = alloca [20 x i32], align 16, addrspace(5)
+ call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
+
+ %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+ store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+ %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+ store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+ %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+ store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+ %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+ %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+ %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+ %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+ %v5 = add i32 %v3, %v4
+ %v = bitcast i32 %v5 to float
+ ret float %v
+}
+
+; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize_nonentry:
+; GCN-NOT: {{^}}v_
+; GCN: %endif
+; GCN: s_bfe_u32 s3, s2, 0x70008
+; GCN-NEXT: s_bfm_b64 exec, s3, 0
+; GCN-NEXT: s_cmp_eq_u32 s3, 64
+; GCN-NEXT: s_cmov_b64 exec, -1
+; GCN: v_mov
+; GCN: v_add
+define amdgpu_ps float @init_exec_input_before_frame_materialize_nonentry(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
+main_body:
+ ; ideally these allocas would be in %endif, but this causes problems with GlobalISel on Windows
+ %array0 = alloca [1024 x i32], align 16, addrspace(5)
+ %array1 = alloca [20 x i32], align 16, addrspace(5)
+
+ %cc = icmp uge i32 %count, 32
+ br i1 %cc, label %endif, label %if
+
+if:
+ call void asm sideeffect "", ""()
+ br label %endif
+
+endif:
+ call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
+
+ %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+ store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+ %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+ store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+ %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+ store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+ %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+ %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+ %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+ %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+ %v5 = add i32 %v3, %v4
+ %v6 = add i32 %v5, %count
+ %v = bitcast i32 %v6 to float
+ ret float %v
+}
+
declare void @llvm.amdgcn.init.exec(i64) #1
declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1