[llvm] aa418b9 - [AMDGPU][SIWholeQuadMode] Use the right VCC register to activate the correct lanes.

Wed Jan 26 08:55:19 PST 2022

Author: Konstantina
Date: 2022-01-26T08:54:39-08:00
New Revision: aa418b91332ca9ea0686ae53ef916456dffcc31c

URL: https://github.com/llvm/llvm-project/commit/aa418b91332ca9ea0686ae53ef916456dffcc31c
DIFF: https://github.com/llvm/llvm-project/commit/aa418b91332ca9ea0686ae53ef916456dffcc31c.diff

LOG: [AMDGPU][SIWholeQuadMode] Use the right VCC register to activate the correct lanes.

Reviewed By: critson

Differential Revision: https://reviews.llvm.org/D118096

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
    llvm/test/CodeGen/AMDGPU/wqm.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 77ee3c0ff0e4..46efb3c605c6 100644

--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -861,12 +861,16 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
   MachineInstr *VcmpMI;
   const MachineOperand &Op0 = MI.getOperand(0);
   const MachineOperand &Op1 = MI.getOperand(1);
+
+  // VCC represents lanes killed.
+  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
+
   if (TRI->isVGPR(*MRI, Op0.getReg())) {
     Opcode = AMDGPU::getVOPe32(Opcode);
     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
   } else {
     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
-                 .addReg(AMDGPU::VCC, RegState::Define)
+                 .addReg(VCC, RegState::Define)
                  .addImm(0) // src0 modifiers
                  .add(Op1)
                  .addImm(0) // src1 modifiers
@@ -874,9 +878,6 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
                  .addImm(0); // omod
   }
 
-  // VCC represents lanes killed.
-  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
-
   MachineInstr *MaskUpdateMI =
       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
           .addReg(LiveMaskReg)

diff  --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index bd2f5459003c..8addc51e8eaa 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3330,6 +3330,49 @@ main_body:
   ret float %out
 }
 
+; Check if the correct VCC register is selected. WQM pass incorrectly uses VCC for
+; vector comparisons in Wave32 mode.
+define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(float addrspace(6)* inreg %0) {
+; GFX9-W64-LABEL: test_for_deactivating_lanes_in_wave32:
+; GFX9-W64:       ; %bb.0: ; %main_body
+; GFX9-W64-NEXT:    s_mov_b32 s3, 0x31016fac
+; GFX9-W64-NEXT:    s_mov_b32 s2, 32
+; GFX9-W64-NEXT:    s_mov_b32 s1, 0x8000
+; GFX9-W64-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
+; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-W64-NEXT:    v_cmp_le_f32_e64 vcc, s0, 0
+; GFX9-W64-NEXT:    s_andn2_b64 s[4:5], exec, vcc
+; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB50_1
+; GFX9-W64-NEXT:    s_endpgm
+; GFX9-W64-NEXT:  .LBB50_1:
+; GFX9-W64-NEXT:    s_mov_b64 exec, 0
+; GFX9-W64-NEXT:    exp null off, off, off, off done vm
+; GFX9-W64-NEXT:    s_endpgm
+;
+; GFX10-W32-LABEL: test_for_deactivating_lanes_in_wave32:
+; GFX10-W32:       ; %bb.0: ; %main_body
+; GFX10-W32-NEXT:    s_mov_b32 s3, 0x31016fac
+; GFX10-W32-NEXT:    s_mov_b32 s2, 32
+; GFX10-W32-NEXT:    s_mov_b32 s1, 0x8000
+; GFX10-W32-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
+; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-W32-NEXT:    v_cmp_le_f32_e64 vcc_lo, s0, 0
+; GFX10-W32-NEXT:    s_andn2_b32 s4, exec_lo, vcc_lo
+; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB50_1
+; GFX10-W32-NEXT:    s_endpgm
+; GFX10-W32-NEXT:  .LBB50_1:
+; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-W32-NEXT:    exp null off, off, off, off done vm
+; GFX10-W32-NEXT:    s_endpgm
+main_body:
+  %1 = ptrtoint float addrspace(6)* %0 to i32
+  %2 = insertelement <4 x i32> <i32 poison, i32 32768, i32 32, i32 822177708>, i32 %1, i32 0
+  %3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %2, i32 0, i32 0) #3
+  %4 = fcmp nsz arcp ugt float %3, 0.000000e+00
+  call void @llvm.amdgcn.kill(i1 %4) #1
+  ret void
+}
+
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
 
@@ -3361,6 +3404,7 @@ declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1,
 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7
 
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }
@@ -3368,3 +3412,4 @@ attributes #3 = { nounwind readnone }
 attributes #4 = { nounwind readnone convergent }
 attributes #5 = { "amdgpu-ps-wqm-outputs" }
 attributes #6 = { nounwind "InitialPSInputAddr"="2" }
+attributes #7 = { nounwind readnone willreturn }