[llvm] r286931 - AMDGPU: Fix f16 fabs/fneg

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 14 18:25:29 PST 2016


Author: arsenm
Date: Mon Nov 14 20:25:28 2016
New Revision: 286931

URL: http://llvm.org/viewvc/llvm-project?rev=286931&view=rev
Log:
AMDGPU: Fix f16 fabs/fneg

Added:
    llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fneg.f16.ll
Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
    llvm/trunk/test/CodeGen/AMDGPU/fneg.ll

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=286931&r1=286930&r2=286931&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Mon Nov 14 20:25:28 2016
@@ -558,13 +558,12 @@ bool AMDGPUTargetLowering::isCheapToSpec
 
 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
   assert(VT.isFloatingPoint());
-  return VT == MVT::f32 || VT == MVT::f64;
+  return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() &&
+                                              VT == MVT::f16);
 }
 
 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
-  assert(VT.isFloatingPoint());
-  return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() &&
-                                              VT == MVT::f16);
+  return isFAbsFree(VT);
 }
 
 bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
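
The practical effect of this hook change: on subtargets with 16-bit
instructions, DAGCombiner now keeps f16 fabs and fneg as floating-point
nodes instead of expanding them through integer bitcasts, which lets the
new selection patterns below match them. A minimal IR sketch (hypothetical
function name) that should now select to a single bit operation on VI:

  define half @fabs_sketch(half %x) {
    %r = call half @llvm.fabs.f16(half %x)
    ret half %r
  }
  declare half @llvm.fabs.f16(half) readnone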

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=286931&r1=286930&r2=286931&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Mon Nov 14 20:25:28 2016
@@ -665,6 +665,21 @@ def : Pat <
     sub1)
 >;
 
+def : Pat <
+  (fneg f16:$src),
+  (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000)))
+>;
+
+def : Pat <
+  (fabs f16:$src),
+  (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff)))
+>;
+
+def : Pat <
+  (fneg (fabs f16:$src)),
+  (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
+>;
+
 /********** ================== **********/
 /********** Immediate Patterns **********/
 /********** ================== **********/
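
The three patterns added above rely on an f16 value carrying its sign in
bit 15 of the low half of a 32-bit register, so negation and absolute
value reduce to single bit operations. A worked example with -3.0 (half
bit pattern 0xc200):

  fneg:        0xc200 ^ 0x8000 = 0x4200   ; -3.0 -> 3.0
  fabs:        0xc200 & 0x7fff = 0x4200   ; -3.0 -> 3.0
  fneg(fabs):  0xc200 | 0x8000 = 0xc200   ; sign bit forced on

Note the combined fneg(fabs) pattern produces a scalar S_OR_B32; when its
input lands in a vector register, the backend is expected to legalize it
to the v_or_b32 form the new tests check for.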

Added: llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll?rev=286931&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll Mon Nov 14 20:25:28 2016
@@ -0,0 +1,93 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; DAGCombiner will transform:
+; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFF))
+; unless isFAbsFree returns true
+
+; GCN-LABEL: {{^}}fabs_free_f16:
+; GCN: flat_load_ushort [[VAL:v[0-9]+]],
+; GCN: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]]
+; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+define void @fabs_free_f16(half addrspace(1)* %out, i16 %in) {
+  %bc = bitcast i16 %in to half
+  %fabs = call half @llvm.fabs.f16(half %bc)
+  store half %fabs, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_f16:
+; CI: flat_load_ushort [[VAL:v[0-9]+]],
+; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[VAL]]
+; CI: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], |[[CVT0]]|
+; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @fabs_f16(half addrspace(1)* %out, half %in) {
+  %fabs = call half @llvm.fabs.f16(half %in)
+  store half %fabs, half addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Should be able to use a single and
+; GCN-LABEL: {{^}}fabs_v2f16:
+; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
+; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
+; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
+; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
+
+; VI: flat_load_ushort [[LO:v[0-9]+]]
+; VI: flat_load_ushort [[HI:v[0-9]+]]
+; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
+; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[LO]]
+; VI-DAG: v_and_b32_e32 [[FABS_HI:v[0-9]+]], [[MASK]], [[HI]]
+; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
+; VI: v_or_b32
+; VI: flat_store_dword
+define void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
+  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
+  store <2 x half> %fabs, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_v4f16:
+; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
+; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
+; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
+; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
+
+; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
+; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+
+; GCN: flat_store_dwordx2
+define void @fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
+  %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
+  store <4 x half> %fabs, <4 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fold_f16:
+; GCN: flat_load_ushort [[IN0:v[0-9]+]]
+; GCN: flat_load_ushort [[IN1:v[0-9]+]]
+; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]]
+; CI-DAG: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], [[IN1]]
+; CI: v_mul_f32_e64 [[RESULT:v[0-9]+]], |[[CVT1]]|, [[CVT0]]
+; CI: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
+; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]]
+
+; VI-NOT: and
+; VI: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN1]]|, [[IN0]]
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) {
+  %fabs = call half @llvm.fabs.f16(half %in0)
+  %fmul = fmul half %fabs, %in1
+  store half %fmul, half addrspace(1)* %out
+  ret void
+}
+
+declare half @llvm.fabs.f16(half) readnone
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>) readnone
+declare <4 x half> @llvm.fabs.v4f16(<4 x half>) readnone
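
The two check prefixes above capture the two lowering strategies: CI has
no 16-bit ALU instructions, so the half is promoted through f32 and the
fabs folds into a source modifier on the conversion, while VI operates on
the 16-bit value directly. Roughly, per the checks (register numbers are
illustrative):

  ; CI: promote to f32 and use the |...| source modifier
  v_cvt_f32_f16_e32 v1, v0
  v_cvt_f16_f32_e64 v0, |v1|

  ; VI: single bit operation
  v_and_b32_e32 v0, 0x7fff, v0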

Added: llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll?rev=286931&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll Mon Nov 14 20:25:28 2016
@@ -0,0 +1,113 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}fneg_fabs_fadd_f16:
+; CI: v_cvt_f32_f16_e32
+; CI: v_cvt_f32_f16_e32
+; CI: v_sub_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |v{{[0-9]+}}|
+
+; VI-NOT: and
+; VI: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
+define void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) {
+  %fabs = call half @llvm.fabs.f16(half %x)
+  %fsub = fsub half -0.000000e+00, %fabs
+  %fadd = fadd half %y, %fsub
+  store half %fadd, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_fabs_fmul_f16:
+; CI: v_cvt_f32_f16_e32
+; CI: v_cvt_f32_f16_e32
+; CI: v_mul_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{v[0-9]+}}|
+; CI: v_cvt_f16_f32_e32
+
+; VI-NOT: and
+; VI: v_mul_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{v[0-9]+}}|
+; VI-NOT: and
+define void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
+  %fabs = call half @llvm.fabs.f16(half %x)
+  %fsub = fsub half -0.000000e+00, %fabs
+  %fmul = fmul half %y, %fsub
+  store half %fmul, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; DAGCombiner will transform:
+; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFF))
+; unless isFAbsFree returns true
+
+; GCN-LABEL: {{^}}fneg_fabs_free_f16:
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
+define void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
+  %bc = bitcast i16 %in to half
+  %fabs = call half @llvm.fabs.f16(half %bc)
+  %fsub = fsub half -0.000000e+00, %fabs
+  store half %fsub, half addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Should use or
+; GCN-LABEL: {{^}}fneg_fabs_f16:
+; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
+; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
+
+; VI: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
+define void @fneg_fabs_f16(half addrspace(1)* %out, half %in) {
+  %fabs = call half @llvm.fabs.f16(half %in)
+  %fsub = fsub half -0.000000e+00, %fabs
+  store half %fsub, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fabs_f16:
+; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
+; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
+
+; VI: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
+define void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+  %val = load half, half addrspace(1)* %in, align 2
+  %fabs = call half @llvm.fabs.f16(half %val)
+  %fsub = fsub half -0.000000e+00, %fabs
+  store half %fsub, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; FIXME: single bit op
+; GCN-LABEL: {{^}}fneg_fabs_v2f16:
+; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
+; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
+
+; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
+; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; VI: flat_store_dword
+define void @fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
+  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
+  %fsub = fsub <2 x half> <half -0.000000e+00, half -0.000000e+00>, %fabs
+  store <2 x half> %fsub, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_fabs_v4f16:
+; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
+; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
+; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
+; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
+
+; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
+; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; VI: flat_store_dwordx2
+define void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
+  %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
+  %fsub = fsub <4 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %fabs
+  store <4 x half> %fsub, <4 x half> addrspace(1)* %out
+  ret void
+}
+
+declare half @llvm.fabs.f16(half) readnone
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>) readnone
+declare <4 x half> @llvm.fabs.v4f16(<4 x half>) readnone
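
When the negated absolute value feeds an arithmetic instruction, no
separate bit operation is needed at all: both CI (after promotion to f32)
and VI fold it as a -|...| source modifier, which is what the VI-NOT: and
lines above guard against. A minimal sketch (hypothetical function name):

  define half @fold_sketch(half %x, half %y) {
    %abs = call half @llvm.fabs.f16(half %x)
    %neg = fsub half -0.0, %abs
    %mul = fmul half %y, %neg   ; expected: v_mul_f16_e64 ..., -|...|
    ret half %mul
  }
  declare half @llvm.fabs.f16(half) readnone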

Added: llvm/trunk/test/CodeGen/AMDGPU/fneg.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fneg.f16.ll?rev=286931&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fneg.f16.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/fneg.f16.ll Mon Nov 14 20:25:28 2016
@@ -0,0 +1,61 @@
+; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+
+; FIXME: Should be able to do scalar op
+; FUNC-LABEL: {{^}}s_fneg_f16:
+
+define void @s_fneg_f16(half addrspace(1)* %out, half %in) {
+  %fneg = fsub half -0.000000e+00, %in
+  store half %fneg, half addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Should be able to use bit operations when the type is
+; illegal as well.
+
+; FUNC-LABEL: {{^}}v_fneg_f16:
+; GCN: flat_load_ushort [[VAL:v[0-9]+]],
+
+; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[VAL]]
+; CI: v_cvt_f16_f32_e64 [[CVT1:v[0-9]+]], -[[CVT0]]
+; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]
+
+; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]]
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
+define void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+  %val = load half, half addrspace(1)* %in, align 2
+  %fneg = fsub half -0.000000e+00, %val
+  store half %fneg, half addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_free_f16:
+; GCN: flat_load_ushort [[NEG_VALUE:v[0-9]+]],
+
+; XCI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}}
+; CI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[NEG_VALUE]]
+; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
+define void @fneg_free_f16(half addrspace(1)* %out, i16 %in) {
+  %bc = bitcast i16 %in to half
+  %fsub = fsub half -0.0, %bc
+  store half %fsub, half addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_fneg_fold_f16:
+; GCN: flat_load_ushort [[NEG_VALUE:v[0-9]+]]
+
+; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[NEG_VALUE]]
+; CI: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[CVT0]], [[CVT0]]
+; CI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[MUL]]
+; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]
+
+; VI-NOT: [[NEG_VALUE]]
+; VI: v_mul_f16_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
+define void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+  %val = load half, half addrspace(1)* %in
+  %fsub = fsub half -0.0, %val
+  %fmul = fmul half %fsub, %val
+  store half %fmul, half addrspace(1)* %out
+  ret void
+}
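
As with fabs, a standalone f16 fneg becomes an xor against 0x8000 on VI
but disappears entirely into a source modifier when it feeds another
instruction, as v_fneg_fold_f16 checks. The tests spell negation as
fsub half -0.0, %x, the canonical IR idiom recognized as fneg:

  define half @fneg_sketch(half %x) {   ; hypothetical name
    %neg = fsub half -0.0, %x           ; canonical fneg idiom
    ret half %neg                       ; expected on VI: v_xor_b32_e32 v0, 0x8000, v0
  }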

Modified: llvm/trunk/test/CodeGen/AMDGPU/fneg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fneg.ll?rev=286931&r1=286930&r2=286931&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fneg.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fneg.ll Mon Nov 14 20:25:28 2016
@@ -1,30 +1,30 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
-; FUNC-LABEL: {{^}}fneg_f32:
+; FUNC-LABEL: {{^}}s_fneg_f32:
 ; R600: -PV
 
 ; GCN: v_xor_b32
-define void @fneg_f32(float addrspace(1)* %out, float %in) {
+define void @s_fneg_f32(float addrspace(1)* %out, float %in) {
   %fneg = fsub float -0.000000e+00, %in
   store float %fneg, float addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}fneg_v2f32:
+; FUNC-LABEL: {{^}}s_fneg_v2f32:
 ; R600: -PV
 ; R600: -PV
 
 ; GCN: v_xor_b32
 ; GCN: v_xor_b32
-define void @fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
+define void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
   %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
   store <2 x float> %fneg, <2 x float> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}fneg_v4f32:
+; FUNC-LABEL: {{^}}s_fneg_v4f32:
 ; R600: -PV
 ; R600: -T
 ; R600: -PV
@@ -34,7 +34,7 @@ define void @fneg_v2f32(<2 x float> addr
 ; GCN: v_xor_b32
 ; GCN: v_xor_b32
 ; GCN: v_xor_b32
-define void @fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
+define void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
   %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
   store <4 x float> %fneg, <4 x float> addrspace(1)* %out
   ret void
@@ -44,15 +44,31 @@ define void @fneg_v4f32(<4 x float> addr
 ; (fneg (f32 bitcast (i32 a))) => (f32 bitcast (xor (i32 a), 0x80000000))
 ; unless the target returns true for isFNegFree()
 
-; FUNC-LABEL: {{^}}fneg_free_f32:
+; FUNC-LABEL: {{^}}fsub0_f32:
+
+; GCN: v_sub_f32_e64 v{{[0-9]}}, 0, s{{[0-9]+$}}
+
 ; R600-NOT: XOR
 ; R600: -KC0[2].Z
+define void @fsub0_f32(float addrspace(1)* %out, i32 %in) {
+  %bc = bitcast i32 %in to float
+  %fsub = fsub float 0.0, %bc
+  store float %fsub, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_free_f32:
+; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
+; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
 
-; XXX: We could use v_add_f32_e64 with the negate bit here instead.
-; GCN: v_sub_f32_e64 v{{[0-9]}}, 0, s{{[0-9]+$}}
+; GCN: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1{{$}}
+; GCN: v_xor_b32_e32 [[RES:v[0-9]+]], [[NEG_VALUE]], [[SIGNBIT]]
+; GCN: buffer_store_dword [[RES]]
+
+; R600-NOT: XOR
+; R600: -PV.W
 define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
-  %fsub = fsub float 0.0, %bc
+  %fsub = fsub float -0.0, %bc
   store float %fsub, float addrspace(1)* %out
   ret void
 }
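
The split of the old fneg_free_f32 test matters because
fsub float 0.0, %x is not a negation: for %x = +0.0 it produces +0.0,
whereas fneg produces -0.0, so the 0.0 form must remain a real
v_sub_f32. Only the -0.0 form is the canonical fneg and may be lowered
to a sign-bit xor:

  fsub float  0.0, +0.0  ->  +0.0   ; ordinary subtraction
  fsub float -0.0, +0.0  ->  -0.0   ; fneg: flips the sign bit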



