[PATCH] D117075: [AMDGPU][WQM] Set the lanes that should be deactivated with VCC.
Konstantina Mitropoulou via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 20 20:08:52 PST 2022
kmitropoulou updated this revision to Diff 401848.
kmitropoulou marked an inline comment as done.
kmitropoulou added a comment.
D117075 <https://reviews.llvm.org/D117075>: [AMDGPU][WQM] Set the lanes that should be deactivated with VCC.
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D117075/new/
https://reviews.llvm.org/D117075
Files:
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
llvm/test/CodeGen/AMDGPU/wqm.ll
Index: llvm/test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/wqm.ll
+++ llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3330,6 +3330,50 @@
ret float %out
}
+; Check that the correct VCC register is selected. The WQM pass used to define VCC
+; (instead of VCC_LO) for the kill's vector comparison in Wave32 mode.
+define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(float addrspace(6)* inreg %0) {
+; GFX9-W64-LABEL: test_for_deactivating_lanes_in_wave32:
+; GFX9-W64: ; %bb.0: ; %main_body
+; GFX9-W64-NEXT: s_mov_b32 s3, 0x31016fac
+; GFX9-W64-NEXT: s_mov_b32 s2, 32
+; GFX9-W64-NEXT: s_mov_b32 s1, 0x8000
+; GFX9-W64-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
+; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-W64-NEXT: v_cmp_le_f32_e64 vcc, s0, 0
+; GFX9-W64-NEXT: s_andn2_b64 s[4:5], exec, vcc
+; GFX9-W64-NEXT: s_cbranch_scc0 .LBB50_1
+; GFX9-W64-NEXT: s_endpgm
+; GFX9-W64-NEXT: .LBB50_1:
+; GFX9-W64-NEXT: s_mov_b64 exec, 0
+; GFX9-W64-NEXT: exp null off, off, off, off done vm
+; GFX9-W64-NEXT: s_endpgm
+;
+; GFX10-W32-LABEL: test_for_deactivating_lanes_in_wave32:
+; GFX10-W32: ; %bb.0: ; %main_body
+; GFX10-W32-NEXT: s_mov_b32 s3, 0x31016fac
+; GFX10-W32-NEXT: s_mov_b32 s2, 32
+; GFX10-W32-NEXT: s_mov_b32 s1, 0x8000
+; GFX10-W32-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
+; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-W32-NEXT: v_cmp_le_f32_e64 vcc_lo, s0, 0
+; GFX10-W32-NEXT: s_andn2_b32 s4, exec_lo, vcc_lo
+; GFX10-W32-NEXT: s_cbranch_scc0 .LBB50_1
+; GFX10-W32-NEXT: s_endpgm
+; GFX10-W32-NEXT: .LBB50_1:
+; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
+; GFX10-W32-NEXT: exp null off, off, off, off done vm
+; GFX10-W32-NEXT: s_endpgm
+main_body:
+ %1 = ptrtoint float addrspace(6)* %0 to i32
+ %2 = insertelement <4 x i32> <i32 poison, i32 32768, i32 32, i32 822177708>, i32 %1, i32 0
+ %3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %2, i32 0, i32 0) #3
+ %4 = fcmp nsz arcp ugt float %3, 0.000000e+00
+ call void @llvm.amdgcn.kill(i1 %4) #1
+ ret void
+}
+
+
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
@@ -3361,6 +3405,7 @@
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
@@ -3368,3 +3413,4 @@
attributes #4 = { nounwind readnone convergent }
attributes #5 = { "amdgpu-ps-wqm-outputs" }
attributes #6 = { nounwind "InitialPSInputAddr"="2" }
+attributes #7 = { nounwind readnone willreturn }
Index: llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -861,12 +861,16 @@
MachineInstr *VcmpMI;
const MachineOperand &Op0 = MI.getOperand(0);
const MachineOperand &Op1 = MI.getOperand(1);
+
+ // VCC represents lanes killed.
+ Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
+
if (TRI->isVGPR(*MRI, Op0.getReg())) {
Opcode = AMDGPU::getVOPe32(Opcode);
VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
} else {
VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
- .addReg(AMDGPU::VCC, RegState::Define)
+ .addReg(VCC, RegState::Define)
.addImm(0) // src0 modifiers
.add(Op1)
.addImm(0) // src1 modifiers
@@ -874,9 +878,6 @@
.addImm(0); // omod
}
- // VCC represents lanes killed.
- Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
-
MachineInstr *MaskUpdateMI =
BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
.addReg(LiveMaskReg)
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D117075.401848.patch
Type: text/x-patch
Size: 4087 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20220121/68623e36/attachment.bin>
More information about the llvm-commits
mailing list