[llvm] aa418b9 - [AMDGPU][SIWholeQuadMode] Use the right VCC register to activate the correct lanes.
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 26 08:55:19 PST 2022
Author: Konstantina
Date: 2022-01-26T08:54:39-08:00
New Revision: aa418b91332ca9ea0686ae53ef916456dffcc31c
URL: https://github.com/llvm/llvm-project/commit/aa418b91332ca9ea0686ae53ef916456dffcc31c
DIFF: https://github.com/llvm/llvm-project/commit/aa418b91332ca9ea0686ae53ef916456dffcc31c.diff
LOG: [AMDGPU][SIWholeQuadMode] Use the right VCC register to activate the correct lanes.
Reviewed By: critson
Differential Revision: https://reviews.llvm.org/D118096
Added:
Modified:
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
llvm/test/CodeGen/AMDGPU/wqm.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 77ee3c0ff0e4..46efb3c605c6 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -861,12 +861,16 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
MachineInstr *VcmpMI;
const MachineOperand &Op0 = MI.getOperand(0);
const MachineOperand &Op1 = MI.getOperand(1);
+
+ // VCC represents lanes killed.
+ Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
+
if (TRI->isVGPR(*MRI, Op0.getReg())) {
Opcode = AMDGPU::getVOPe32(Opcode);
VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
} else {
VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
- .addReg(AMDGPU::VCC, RegState::Define)
+ .addReg(VCC, RegState::Define)
.addImm(0) // src0 modifiers
.add(Op1)
.addImm(0) // src1 modifiers
@@ -874,9 +878,6 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
.addImm(0); // omod
}
- // VCC represents lanes killed.
- Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
-
MachineInstr *MaskUpdateMI =
BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
.addReg(LiveMaskReg)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index bd2f5459003c..8addc51e8eaa 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3330,6 +3330,49 @@ main_body:
ret float %out
}
+; Check if the correct VCC register is selected. WQM pass incorrectly uses VCC for
+; vector comparisons in Wave32 mode.
+define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(float addrspace(6)* inreg %0) {
+; GFX9-W64-LABEL: test_for_deactivating_lanes_in_wave32:
+; GFX9-W64: ; %bb.0: ; %main_body
+; GFX9-W64-NEXT: s_mov_b32 s3, 0x31016fac
+; GFX9-W64-NEXT: s_mov_b32 s2, 32
+; GFX9-W64-NEXT: s_mov_b32 s1, 0x8000
+; GFX9-W64-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
+; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-W64-NEXT: v_cmp_le_f32_e64 vcc, s0, 0
+; GFX9-W64-NEXT: s_andn2_b64 s[4:5], exec, vcc
+; GFX9-W64-NEXT: s_cbranch_scc0 .LBB50_1
+; GFX9-W64-NEXT: s_endpgm
+; GFX9-W64-NEXT: .LBB50_1:
+; GFX9-W64-NEXT: s_mov_b64 exec, 0
+; GFX9-W64-NEXT: exp null off, off, off, off done vm
+; GFX9-W64-NEXT: s_endpgm
+;
+; GFX10-W32-LABEL: test_for_deactivating_lanes_in_wave32:
+; GFX10-W32: ; %bb.0: ; %main_body
+; GFX10-W32-NEXT: s_mov_b32 s3, 0x31016fac
+; GFX10-W32-NEXT: s_mov_b32 s2, 32
+; GFX10-W32-NEXT: s_mov_b32 s1, 0x8000
+; GFX10-W32-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
+; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-W32-NEXT: v_cmp_le_f32_e64 vcc_lo, s0, 0
+; GFX10-W32-NEXT: s_andn2_b32 s4, exec_lo, vcc_lo
+; GFX10-W32-NEXT: s_cbranch_scc0 .LBB50_1
+; GFX10-W32-NEXT: s_endpgm
+; GFX10-W32-NEXT: .LBB50_1:
+; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
+; GFX10-W32-NEXT: exp null off, off, off, off done vm
+; GFX10-W32-NEXT: s_endpgm
+main_body:
+ %1 = ptrtoint float addrspace(6)* %0 to i32
+ %2 = insertelement <4 x i32> <i32 poison, i32 32768, i32 32, i32 822177708>, i32 %1, i32 0
+ %3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %2, i32 0, i32 0) #3
+ %4 = fcmp nsz arcp ugt float %3, 0.000000e+00
+ call void @llvm.amdgcn.kill(i1 %4) #1
+ ret void
+}
+
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
@@ -3361,6 +3404,7 @@ declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1,
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
@@ -3368,3 +3412,4 @@ attributes #3 = { nounwind readnone }
attributes #4 = { nounwind readnone convergent }
attributes #5 = { "amdgpu-ps-wqm-outputs" }
attributes #6 = { nounwind "InitialPSInputAddr"="2" }
+attributes #7 = { nounwind readnone willreturn }
More information about the llvm-commits
mailing list