[llvm] 0b46b07 - AMDGPU/GlobalISel: Fix incorrect VOP3P fneg folding

Mon Feb 24 18:22:36 PST 2020

Author: Matt Arsenault
Date: 2020-02-24T21:20:35-05:00
New Revision: 0b46b078b656eacaf8fb0b550825189a051f0744

URL: https://github.com/llvm/llvm-project/commit/0b46b078b656eacaf8fb0b550825189a051f0744
DIFF: https://github.com/llvm/llvm-project/commit/0b46b078b656eacaf8fb0b550825189a051f0744.diff

LOG: AMDGPU/GlobalISel: Fix incorrect VOP3P fneg folding

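Some VOP3P operands are plain s32 values, so there is no intervening
cast between a 32-bit fneg and the operand that uses it. The selector
was therefore folding an f32 fneg into the neg_lo/neg_hi modifiers as
if it were a packed fneg. Make sure the source is really a packed
(<2 x s16>) fneg before folding.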

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index e1ca71db2e4d..4a643e50e2db 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2479,7 +2479,10 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(
   unsigned Mods = 0;
   MachineInstr *MI = MRI.getVRegDef(Src);
 
-  if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
+  if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
+      // It's possible to see an f32 fneg here, but unlikely.
+      // TODO: Treat f32 fneg as only high bit.
+      MRI.getType(Src) == LLT::vector(2, 16)) {
     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
     Src = MI->getOperand(1).getReg();
     MI = MRI.getVRegDef(Src);

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
index f792ed8f3585..b22f188f5bfa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
@@ -101,15 +101,17 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX906-LABEL: v_fdot2_neg_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdot2_neg_c:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.c = fneg float %c
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
index a72fa3e10336..4f9fb6d5d592 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
@@ -276,21 +276,24 @@ define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
 ; GFX906-LABEL: v_sdot2_fnegf32_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_fnegf32_c:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_fnegf32_c:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.c = fneg float %c
   %cast.neg.c = bitcast float %neg.c to i32

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
index 7413eb6d1378..e9404e701be7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -98,15 +98,17 @@ define i32 @v_sdot4_fnegf32_a(float %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_sdot4_fnegf32_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX906-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot4_fnegf32_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg float %a
   %cast.neg.a = bitcast float %neg.a to i32

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
index 2043d6fe4109..c24c2908ce90 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
@@ -51,15 +51,17 @@ define i32 @v_sdot8_fnegf32_a(float %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_sdot8_fnegf32_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX906-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot8_fnegf32_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg float %a
   %cast.neg.a = bitcast float %neg.a to i32

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
index 738454a5755c..a6a85bbb5c7e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
@@ -276,21 +276,24 @@ define i32 @v_udot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
 ; GFX906-LABEL: v_udot2_fnegf32_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_fnegf32_c:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_fnegf32_c:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.c = fneg float %c
   %cast.neg.c = bitcast float %neg.c to i32

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
index 9b20c00a0b46..7531214c4d5d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -98,15 +98,17 @@ define i32 @v_udot4_fnegf32_a(float %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_udot4_fnegf32_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX906-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot4_fnegf32_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg float %a
   %cast.neg.a = bitcast float %neg.a to i32

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
index 5c12bda16f9a..32d4a2144894 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
@@ -51,15 +51,17 @@ define i32 @v_udot8_fnegf32_a(float %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_udot8_fnegf32_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX906-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot8_fnegf32_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg float %a
   %cast.neg.a = bitcast float %neg.a to i32