[llvm] 0b46b07 - AMDGPU/GlobalISel: Fix incorrect VOP3P fneg folding
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 24 18:22:36 PST 2020
Author: Matt Arsenault
Date: 2020-02-24T21:20:35-05:00
New Revision: 0b46b078b656eacaf8fb0b550825189a051f0744
URL: https://github.com/llvm/llvm-project/commit/0b46b078b656eacaf8fb0b550825189a051f0744
DIFF: https://github.com/llvm/llvm-project/commit/0b46b078b656eacaf8fb0b550825189a051f0744.diff
LOG: AMDGPU/GlobalISel: Fix incorrect VOP3P fneg folding
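Some VOP3P operands are plain s32 values, and there are no intervening
casts between a 32-bit fneg and such an operand, so the fneg was being
folded as if it were packed. Make sure the fneg really is a packed
(<2 x s16>) fneg before folding it into the neg/neg_hi source modifiers.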
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index e1ca71db2e4d..4a643e50e2db 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2479,7 +2479,10 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(
unsigned Mods = 0;
MachineInstr *MI = MRI.getVRegDef(Src);
- if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
+ if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
+ // It's possible to see an f32 fneg here, but unlikely.
+ // TODO: Treat f32 fneg as only high bit.
+ MRI.getType(Src) == LLT::vector(2, 16)) {
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
Src = MI->getOperand(1).getReg();
MI = MRI.getVRegDef(Src);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
index f792ed8f3585..b22f188f5bfa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
@@ -101,15 +101,17 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-LABEL: v_fdot2_neg_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdot2_neg_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg float %c
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
index a72fa3e10336..4f9fb6d5d592 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
@@ -276,21 +276,24 @@ define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
; GFX906-LABEL: v_sdot2_fnegf32_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fnegf32_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX908-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fnegf32_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg float %c
%cast.neg.c = bitcast float %neg.c to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
index 7413eb6d1378..e9404e701be7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -98,15 +98,17 @@ define i32 @v_sdot4_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot4_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot4_fnegf32_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg float %a
%cast.neg.a = bitcast float %neg.a to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
index 2043d6fe4109..c24c2908ce90 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
@@ -51,15 +51,17 @@ define i32 @v_sdot8_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot8_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot8_fnegf32_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg float %a
%cast.neg.a = bitcast float %neg.a to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
index 738454a5755c..a6a85bbb5c7e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
@@ -276,21 +276,24 @@ define i32 @v_udot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
; GFX906-LABEL: v_udot2_fnegf32_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_fnegf32_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX908-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_fnegf32_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg float %c
%cast.neg.c = bitcast float %neg.c to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
index 9b20c00a0b46..7531214c4d5d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -98,15 +98,17 @@ define i32 @v_udot4_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot4_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot4_fnegf32_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg float %a
%cast.neg.a = bitcast float %neg.a to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
index 5c12bda16f9a..32d4a2144894 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
@@ -51,15 +51,17 @@ define i32 @v_udot8_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot8_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot8_fnegf32_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg float %a
%cast.neg.a = bitcast float %neg.a to i32
More information about the llvm-commits
mailing list