[PATCH] D117075: [AMDGPU][WQM] Set the lanes that should be deactivated with VCC.

Thu Jan 20 20:08:52 PST 2022

kmitropoulou updated this revision to Diff 401848.
kmitropoulou marked an inline comment as done.
kmitropoulou added a comment.

D117075 <https://reviews.llvm.org/D117075>: [AMDGPU][WQM] Set the lanes that should be deactivated with VCC.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117075/new/

https://reviews.llvm.org/D117075

Files:
  llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
  llvm/test/CodeGen/AMDGPU/wqm.ll


Index: llvm/test/CodeGen/AMDGPU/wqm.ll
===================================================================

--- llvm/test/CodeGen/AMDGPU/wqm.ll
+++ llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3330,6 +3330,50 @@
   ret float %out
 }
 
+; Check that the correct VCC register is selected. The WQM pass used to define VCC
+; instead of VCC_LO for vector comparisons in Wave32 mode.
+define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(float addrspace(6)* inreg %0) {
+; GFX9-W64-LABEL: test_for_deactivating_lanes_in_wave32:
+; GFX9-W64:       ; %bb.0: ; %main_body
+; GFX9-W64-NEXT:    s_mov_b32 s3, 0x31016fac
+; GFX9-W64-NEXT:    s_mov_b32 s2, 32
+; GFX9-W64-NEXT:    s_mov_b32 s1, 0x8000
+; GFX9-W64-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
+; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-W64-NEXT:    v_cmp_le_f32_e64 vcc, s0, 0
+; GFX9-W64-NEXT:    s_andn2_b64 s[4:5], exec, vcc
+; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB50_1
+; GFX9-W64-NEXT:    s_endpgm
+; GFX9-W64-NEXT:  .LBB50_1:
+; GFX9-W64-NEXT:    s_mov_b64 exec, 0
+; GFX9-W64-NEXT:    exp null off, off, off, off done vm
+; GFX9-W64-NEXT:    s_endpgm
+;
+; GFX10-W32-LABEL: test_for_deactivating_lanes_in_wave32:
+; GFX10-W32:       ; %bb.0: ; %main_body
+; GFX10-W32-NEXT:    s_mov_b32 s3, 0x31016fac
+; GFX10-W32-NEXT:    s_mov_b32 s2, 32
+; GFX10-W32-NEXT:    s_mov_b32 s1, 0x8000
+; GFX10-W32-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
+; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-W32-NEXT:    v_cmp_le_f32_e64 vcc_lo, s0, 0
+; GFX10-W32-NEXT:    s_andn2_b32 s4, exec_lo, vcc_lo
+; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB50_1
+; GFX10-W32-NEXT:    s_endpgm
+; GFX10-W32-NEXT:  .LBB50_1:
+; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-W32-NEXT:    exp null off, off, off, off done vm
+; GFX10-W32-NEXT:    s_endpgm
+main_body:
+  %1 = ptrtoint float addrspace(6)* %0 to i32
+  %2 = insertelement <4 x i32> <i32 poison, i32 32768, i32 32, i32 822177708>, i32 %1, i32 0
+  %3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %2, i32 0, i32 0) #3
+  %4 = fcmp nsz arcp ugt float %3, 0.000000e+00
+  call void @llvm.amdgcn.kill(i1 %4) #1
+  ret void
+}
+
+
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
 
@@ -3361,6 +3405,7 @@
 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7
 
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }
@@ -3368,3 +3413,4 @@
 attributes #4 = { nounwind readnone convergent }
 attributes #5 = { "amdgpu-ps-wqm-outputs" }
 attributes #6 = { nounwind "InitialPSInputAddr"="2" }
+attributes #7 = { nounwind readnone willreturn }
Index: llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -861,12 +861,16 @@
   MachineInstr *VcmpMI;
   const MachineOperand &Op0 = MI.getOperand(0);
   const MachineOperand &Op1 = MI.getOperand(1);
+
+  // VCC represents lanes killed.
+  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
+
   if (TRI->isVGPR(*MRI, Op0.getReg())) {
     Opcode = AMDGPU::getVOPe32(Opcode);
     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
   } else {
     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
-                 .addReg(AMDGPU::VCC, RegState::Define)
+                 .addReg(VCC, RegState::Define)
                  .addImm(0) // src0 modifiers
                  .add(Op1)
                  .addImm(0) // src1 modifiers
@@ -874,9 +878,6 @@
                  .addImm(0); // omod
   }
 
-  // VCC represents lanes killed.
-  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
-
   MachineInstr *MaskUpdateMI =
       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
           .addReg(LiveMaskReg)


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D117075.401848.patch
Type: text/x-patch
Size: 4087 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20220121/68623e36/attachment.bin>