[llvm] 020a9d7 - [GISel] Add (fsub +-0.0, X) -> fneg combine
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 3 01:21:57 PDT 2022
Author: Pierre van Houtryve
Date: 2022-11-03T08:21:50Z
New Revision: 020a9d7b20a2f405b6fd61be0d9f946da44c79af
URL: https://github.com/llvm/llvm-project/commit/020a9d7b20a2f405b6fd61be0d9f946da44c79af
DIFF: https://github.com/llvm/llvm-project/commit/020a9d7b20a2f405b6fd61be0d9f946da44c79af.diff
LOG: [GISel] Add (fsub +-0.0, X) -> fneg combine
Allows for better matching of VOP3 mods.
Reviewed By: arsenm
Differential Revision: https://reviews.llvm.org/D136442
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fsub-fneg.mir
Modified:
llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
llvm/include/llvm/Target/GlobalISel/Combine.td
llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
llvm/test/CodeGen/AMDGPU/v_pack.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 515382a8e869e..5c54f0e8ab058 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -696,6 +696,9 @@ class CombinerHelper {
/// (fma fneg(x), fneg(y), z) -> (fma x, y, z)
bool matchRedundantNegOperands(MachineInstr &MI, BuildFnTy &MatchInfo);
+ bool matchFsubToFneg(MachineInstr &MI, Register &MatchInfo);
+ void applyFsubToFneg(MachineInstr &MI, Register &MatchInfo);
+
bool canCombineFMadOrFMA(MachineInstr &MI, bool &AllowFusionGlobally,
bool &HasFMAD, bool &Aggressive,
bool CanReassociate = false);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 9f29e9faf385b..dd5d929e615c0 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -856,6 +856,13 @@ def redundant_neg_operands: GICombineRule<
[{ return Helper.matchRedundantNegOperands(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>;
+// Transform (fsub +-0.0, X) -> (fneg X); $matchinfo carries X to the apply.
+def fsub_to_fneg: GICombineRule<
+ (defs root:$root, register_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_FSUB):$root,
+ [{ return Helper.matchFsubToFneg(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyFsubToFneg(*${root}, ${matchinfo}); }])>;
+
// Transform (fadd x, (fmul y, z)) -> (fma y, z, x)
// (fadd x, (fmul y, z)) -> (fmad y, z, x)
// Transform (fadd (fmul x, y), z) -> (fma x, y, z)
@@ -1056,7 +1063,8 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
form_bitfield_extract, constant_fold, fabs_fneg_fold,
intdiv_combines, mulh_combines, redundant_neg_operands,
and_or_disjoint_mask, fma_combines, fold_binop_into_select,
- sub_add_reg, select_to_minmax, redundant_binop_in_equality]>;
+ sub_add_reg, select_to_minmax, redundant_binop_in_equality,
+ fsub_to_fneg]>;
// A combine group used to for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 1fea2607c061f..a233936ae9dae 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5190,6 +5190,38 @@ bool CombinerHelper::matchRedundantNegOperands(MachineInstr &MI,
return true;
}
+bool CombinerHelper::matchFsubToFneg(MachineInstr &MI, Register &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_FSUB);
+ // Matches (fsub C, X) where C is +-0.0; MatchInfo receives X (operand 2).
+ Register LHS = MI.getOperand(1).getReg();
+ MatchInfo = MI.getOperand(2).getReg(); // written even when the match fails
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ // C is looked up as a splat (undef lanes allowed) when the result is a vector.
+ const auto LHSCst = Ty.isVector()
+ ? getFConstantSplat(LHS, MRI, /* allowUndef */ true)
+ : getFConstantVRegValWithLookThrough(LHS, MRI);
+ if (!LHSCst) // LHS is not a recognizable FP constant
+ return false;
+
+ // -0.0 is always allowed: (-0.0 - X) gives -X even when X is +-0.0.
+ if (LHSCst->Value.isNegZero())
+ return true;
+
+ // +0.0 is only allowed if nsz is set: (+0.0 - +0.0) is +0.0, not -0.0.
+ if (LHSCst->Value.isPosZero())
+ return MI.getFlag(MachineInstr::FmNsz);
+ // Any other constant LHS is a genuine subtraction; leave it alone.
+ return false;
+}
+/// Replaces the matched G_FSUB \p MI with fneg(fcanonicalize(MatchInfo)).
+void CombinerHelper::applyFsubToFneg(MachineInstr &MI, Register &MatchInfo) {
+ Builder.setInstrAndDebugLoc(MI);
+ Register Dst = MI.getOperand(0).getReg(); // G_FSUB result, reused as fneg dst
+ Builder.buildFNeg( // NOTE(review): canonicalize presumably mirrors fsub; confirm
+ Dst, Builder.buildFCanonicalize(MRI.getType(Dst), MatchInfo).getReg(0));
+ eraseInst(MI); // Dst is now defined by the new G_FNEG
+}
+
/// Checks if \p MI is TargetOpcode::G_FMUL and contractable either
/// due to global flags or MachineInstr flags.
static bool isContractableFMul(MachineInstr &MI, bool AllowFusionGlobally) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fsub-fneg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fsub-fneg.mir
new file mode 100644
index 0000000000000..2bce205735299
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fsub-fneg.mir
@@ -0,0 +1,387 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: test_f16_poszero_nsz
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: test_f16_poszero_nsz
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: %input:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG %input
+ ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[FNEG]]
+ ; CHECK-NEXT: %res:_(s32) = G_ANYEXT [[FCANONICALIZE]](s16)
+ ; CHECK-NEXT: $vgpr0 = COPY %res(s32)
+ %0:_(s32) = COPY $vgpr0
+ %input:_(s16) = G_TRUNC %0
+ %cst:_(s16) = G_FCONSTANT half 0.0
+ %sub:_(s16) = nsz G_FSUB %cst, %input
+ %res:_(s32) = G_ANYEXT %sub
+ $vgpr0 = COPY %res
+...
+
+---
+name: test_f16_poszero_nonsz_nofold
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: test_f16_poszero_nonsz_nofold
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: %input:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: %cst:_(s16) = G_FCONSTANT half 0xH0000
+ ; CHECK-NEXT: %sub:_(s16) = G_FSUB %cst, %input
+ ; CHECK-NEXT: %res:_(s32) = G_ANYEXT %sub(s16)
+ ; CHECK-NEXT: $vgpr0 = COPY %res(s32)
+ %0:_(s32) = COPY $vgpr0
+ %input:_(s16) = G_TRUNC %0
+ %cst:_(s16) = G_FCONSTANT half 0.0
+ %sub:_(s16) = G_FSUB %cst, %input
+ %res:_(s32) = G_ANYEXT %sub
+ $vgpr0 = COPY %res
+...
+
+---
+name: test_f16_negzero
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: test_f16_negzero
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: %input:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG %input
+ ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[FNEG]]
+ ; CHECK-NEXT: %res:_(s32) = G_ANYEXT [[FCANONICALIZE]](s16)
+ ; CHECK-NEXT: $vgpr0 = COPY %res(s32)
+ %0:_(s32) = COPY $vgpr0
+ %input:_(s16) = G_TRUNC %0
+ %cst:_(s16) = G_FCONSTANT half -0.0
+ %sub:_(s16) = G_FSUB %cst, %input
+ %res:_(s32) = G_ANYEXT %sub
+ $vgpr0 = COPY %res
+...
+
+---
+name: test_f32_poszero_nsz
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: test_f32_poszero_nsz
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %input:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG %input
+ ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FNEG]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[FCANONICALIZE]](s32)
+ %input:_(s32) = COPY $vgpr0
+ %cst:_(s32) = G_FCONSTANT float 0.0
+ %sub:_(s32) = nsz G_FSUB %cst, %input
+ $vgpr0 = COPY %sub
+...
+
+---
+name: test_f32_poszero_nonsz_nofold
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: test_f32_poszero_nonsz_nofold
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %input:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 0.000000e+00
+ ; CHECK-NEXT: %sub:_(s32) = G_FSUB %cst, %input
+ ; CHECK-NEXT: $vgpr0 = COPY %sub(s32)
+ %input:_(s32) = COPY $vgpr0
+ %cst:_(s32) = G_FCONSTANT float 0.0
+ %sub:_(s32) = G_FSUB %cst, %input
+ $vgpr0 = COPY %sub
+...
+
+---
+name: test_f32_negzero
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: test_f32_negzero
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %input:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG %input
+ ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FNEG]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[FCANONICALIZE]](s32)
+ %input:_(s32) = COPY $vgpr0
+ %cst:_(s32) = G_FCONSTANT float -0.0
+ %sub:_(s32) = G_FSUB %cst, %input
+ $vgpr0 = COPY %sub
+...
+
+---
+name: test_f64_poszero_nsz
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; CHECK-LABEL: name: test_f64_poszero_nsz
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %input:_(s64) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG %input
+ ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s64) = G_FCANONICALIZE [[FNEG]]
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[FCANONICALIZE]](s64)
+ %input:_(s64) = COPY $vgpr0_vgpr1
+ %cst:_(s64) = G_FCONSTANT double 0.0
+ %sub:_(s64) = nsz G_FSUB %cst, %input
+ $vgpr0_vgpr1 = COPY %sub
+...
+
+---
+name: test_f64_poszero_nonsz_nofold
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; CHECK-LABEL: name: test_f64_poszero_nonsz_nofold
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %input:_(s64) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: %cst:_(s64) = G_FCONSTANT double 0.000000e+00
+ ; CHECK-NEXT: %sub:_(s64) = G_FSUB %cst, %input
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY %sub(s64)
+ %input:_(s64) = COPY $vgpr0_vgpr1
+ %cst:_(s64) = G_FCONSTANT double 0.0
+ %sub:_(s64) = G_FSUB %cst, %input
+ $vgpr0_vgpr1 = COPY %sub
+...
+
+---
+name: test_f64_negzero
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; CHECK-LABEL: name: test_f64_negzero
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %input:_(s64) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG %input
+ ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s64) = G_FCANONICALIZE [[FNEG]]
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[FCANONICALIZE]](s64)
+ %input:_(s64) = COPY $vgpr0_vgpr1
+ %cst:_(s64) = G_FCONSTANT double -0.0
+ %sub:_(s64) = G_FSUB %cst, %input
+ $vgpr0_vgpr1 = COPY %sub
+...
+
+---
+name: test_v4f16_poszero_nsz
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; CHECK-LABEL: name: test_v4f16_poszero_nsz
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %input:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<4 x s16>) = G_FNEG %input
+ ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<4 x s16>) = G_FCANONICALIZE [[FNEG]]
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[FCANONICALIZE]](<4 x s16>)
+ %input:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ %cst:_(s16) = G_FCONSTANT half 0.0
+ %veccst:_(<4 x s16>) = G_BUILD_VECTOR %cst, %cst, %cst, %cst
+ %sub:_(<4 x s16>) = nsz G_FSUB %veccst, %input
+ $vgpr0_vgpr1 = COPY %sub
+...
+
+---
+name: test_v4f16_poszero_nonsz_nofold
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; CHECK-LABEL: name: test_v4f16_poszero_nonsz_nofold
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %input:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: %cst:_(s16) = G_FCONSTANT half 0xH0000
+ ; CHECK-NEXT: %veccst:_(<4 x s16>) = G_BUILD_VECTOR %cst(s16), %cst(s16), %cst(s16), %cst(s16)
+ ; CHECK-NEXT: %sub:_(<4 x s16>) = G_FSUB %veccst, %input
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY %sub(<4 x s16>)
+ %input:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ %cst:_(s16) = G_FCONSTANT half 0.0
+ %veccst:_(<4 x s16>) = G_BUILD_VECTOR %cst, %cst, %cst, %cst
+ %sub:_(<4 x s16>) = G_FSUB %veccst, %input
+ $vgpr0_vgpr1 = COPY %sub
+...
+
+---
+name: test_v4f16_negzero
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; CHECK-LABEL: name: test_v4f16_negzero
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %input:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<4 x s16>) = G_FNEG %input
+ ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<4 x s16>) = G_FCANONICALIZE [[FNEG]]
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[FCANONICALIZE]](<4 x s16>)
+ %input:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ %cst:_(s16) = G_FCONSTANT half -0.0
+ %veccst:_(<4 x s16>) = G_BUILD_VECTOR %cst, %cst, %cst, %cst
+ %sub:_(<4 x s16>) = G_FSUB %veccst, %input
+ $vgpr0_vgpr1 = COPY %sub
+...
+
+---
+name: test_v4f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: test_v4f32
+ ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %input:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<4 x s32>) = G_FNEG %input
+ ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<4 x s32>) = G_FCANONICALIZE [[FNEG]]
+ ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FCANONICALIZE]](<4 x s32>)
+ %input:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %cst:_(s32) = G_FCONSTANT float 0.0
+ %veccst:_(<4 x s32>) = G_BUILD_VECTOR %cst, %cst, %cst, %cst
+ %sub:_(<4 x s32>) = nsz G_FSUB %veccst, %input
+ $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %sub
+...
+
+---
+name: test_v4f32_negzero
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: test_v4f32_negzero
+ ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %input:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<4 x s32>) = G_FNEG %input
+ ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<4 x s32>) = G_FCANONICALIZE [[FNEG]]
+ ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FCANONICALIZE]](<4 x s32>)
+ %input:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %cst:_(s32) = G_FCONSTANT float -0.0
+ %veccst:_(<4 x s32>) = G_BUILD_VECTOR %cst, %cst, %cst, %cst
+ %sub:_(<4 x s32>) = G_FSUB %veccst, %input
+ $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %sub
+...
+
+---
+name: test_v4f32_negzero_undef_elt
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: test_v4f32_negzero_undef_elt
+ ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %input:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<4 x s32>) = G_FNEG %input
+ ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<4 x s32>) = G_FCANONICALIZE [[FNEG]]
+ ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FCANONICALIZE]](<4 x s32>)
+ %input:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %cst:_(s32) = G_FCONSTANT float -0.0
+ %undef:_(s32) = G_IMPLICIT_DEF
+ %veccst:_(<4 x s32>) = G_BUILD_VECTOR %cst, %undef, %cst, %cst
+ %sub:_(<4 x s32>) = G_FSUB %veccst, %input
+ $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %sub
+...
+
+---
+name: test_v4f32_poszero_undef_elt
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: test_v4f32_poszero_undef_elt
+ ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %input:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<4 x s32>) = G_FNEG %input
+ ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<4 x s32>) = G_FCANONICALIZE [[FNEG]]
+ ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FCANONICALIZE]](<4 x s32>)
+ %input:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %cst:_(s32) = G_FCONSTANT float 0.0
+ %undef:_(s32) = G_IMPLICIT_DEF
+ %veccst:_(<4 x s32>) = G_BUILD_VECTOR %cst, %undef, %cst, %cst
+ %sub:_(<4 x s32>) = nsz G_FSUB %veccst, %input
+ $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %sub
+...
+
+---
+name: test_v2f64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: test_v2f64
+ ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %input:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<2 x s64>) = G_FNEG %input
+ ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s64>) = G_FCANONICALIZE [[FNEG]]
+ ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FCANONICALIZE]](<2 x s64>)
+ %input:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %cst:_(s64) = G_FCONSTANT double 0.0
+ %veccst:_(<2 x s64>) = G_BUILD_VECTOR %cst, %cst
+ %sub:_(<2 x s64>) = nsz G_FSUB %veccst, %input
+ $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %sub
+...
+
+---
+name: test_v2f64_negzero
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: test_v2f64_negzero
+ ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %input:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<2 x s64>) = G_FNEG %input
+ ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s64>) = G_FCANONICALIZE [[FNEG]]
+ ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FCANONICALIZE]](<2 x s64>)
+ %input:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %cst:_(s64) = G_FCONSTANT double -0.0
+ %veccst:_(<2 x s64>) = G_BUILD_VECTOR %cst, %cst
+ %sub:_(<2 x s64>) = G_FSUB %veccst, %input
+ $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %sub
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
index 2e9a66c579cbe..92961ab1c4dda 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
@@ -23,7 +23,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrs
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
+; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2
; SI-NEXT: v_med3_f32 v2, v2, v3, v4
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -56,7 +56,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrs
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7
+; VI-NEXT: v_mul_f32_e32 v4, -1.0, v7
; VI-NEXT: v_med3_f32 v2, v4, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -72,7 +72,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrs
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
+; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -88,7 +88,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrs
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
+; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
@@ -104,7 +104,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrs
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
+; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -145,7 +145,7 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
+; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: v_min_f32_e32 v5, v2, v3
; SI-NEXT: v_max_f32_e32 v2, v2, v3
@@ -183,7 +183,7 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7
+; VI-NEXT: v_mul_f32_e32 v4, -1.0, v7
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT: v_min_f32_e32 v5, v4, v2
; VI-NEXT: v_max_f32_e32 v2, v4, v2
@@ -204,7 +204,7 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
+; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
@@ -225,7 +225,7 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
+; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
; GFX10-NEXT: v_max_f32_e32 v4, v1, v2
@@ -246,7 +246,8 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_sub_f32 v1, 0x80000000, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_f32_e32 v4, v1, v2
; GFX11-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3
@@ -289,9 +290,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: s_mov_b32 s2, 0x80000000
-; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
-; SI-NEXT: v_sub_f32_e64 v4, s2, |v4|
+; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2
+; SI-NEXT: v_mul_f32_e64 v4, -1.0, |v4|
; SI-NEXT: v_med3_f32 v2, v2, |v3|, v4
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -320,13 +320,12 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s2, 0x80000000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7
-; VI-NEXT: v_sub_f32_e64 v3, s2, |v3|
+; VI-NEXT: v_mul_f32_e32 v4, -1.0, v7
+; VI-NEXT: v_mul_f32_e64 v3, -1.0, |v3|
; VI-NEXT: v_med3_f32 v2, v4, |v2|, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -342,9 +341,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s2, 0x80000000
-; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
-; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3|
+; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1
+; GFX9-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX9-NEXT: v_med3_f32 v1, v1, |v2|, v3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -360,8 +358,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
-; GFX10-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3|
+; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1
+; GFX10-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX10-NEXT: v_med3_f32 v1, v1, |v2|, v3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
@@ -377,8 +375,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
-; GFX11-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3|
+; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1
+; GFX11-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, |v2|, v3
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -425,10 +423,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: s_mov_b32 s2, 0x80000000
-; SI-NEXT: v_sub_f32_e64 v2, s2, |v2|
-; SI-NEXT: v_sub_f32_e64 v3, s2, |v3|
-; SI-NEXT: v_sub_f32_e64 v4, s2, |v4|
+; SI-NEXT: v_mul_f32_e64 v2, -1.0, |v2|
+; SI-NEXT: v_mul_f32_e64 v3, -1.0, |v3|
+; SI-NEXT: v_mul_f32_e64 v4, -1.0, |v4|
; SI-NEXT: v_med3_f32 v2, v2, v3, v4
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -457,14 +454,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s2, 0x80000000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_sub_f32_e64 v4, s2, |v7|
-; VI-NEXT: v_sub_f32_e64 v2, s2, |v2|
-; VI-NEXT: v_sub_f32_e64 v3, s2, |v3|
+; VI-NEXT: v_mul_f32_e64 v4, -1.0, |v7|
+; VI-NEXT: v_mul_f32_e64 v2, -1.0, |v2|
+; VI-NEXT: v_mul_f32_e64 v3, -1.0, |v3|
; VI-NEXT: v_med3_f32 v2, v4, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -480,10 +476,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s2, 0x80000000
-; GFX9-NEXT: v_sub_f32_e64 v1, s2, |v1|
-; GFX9-NEXT: v_sub_f32_e64 v2, s2, |v2|
-; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3|
+; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX9-NEXT: v_max_f32_e64 v2, -|v2|, -|v2|
+; GFX9-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -499,9 +494,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_sub_f32_e64 v1, 0x80000000, |v1|
-; GFX10-NEXT: v_sub_f32_e64 v2, 0x80000000, |v2|
-; GFX10-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3|
+; GFX10-NEXT: v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX10-NEXT: v_max_f32_e64 v2, -|v2|, -|v2|
+; GFX10-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
@@ -517,9 +512,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_sub_f32_e64 v1, 0x80000000, |v1|
-; GFX11-NEXT: v_sub_f32_e64 v2, 0x80000000, |v2|
-; GFX11-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3|
+; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX11-NEXT: v_max_f32_e64 v2, -|v2|, -|v2|
+; GFX11-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index e17d38cff6332..1fbf9593aceea 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -223,9 +223,7 @@ define amdgpu_kernel void @v_pack_b32.fneg(half addrspace(1)* %in0, half addrspa
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT: v_sub_f16_e32 v0, 0x8000, v0
-; GISEL-NEXT: v_sub_f16_e32 v1, 0x8000, v1
-; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
+; GISEL-NEXT: v_pack_b32_f16 v0, -v0, -v1
; GISEL-NEXT: ;;#ASMSTART
; GISEL-NEXT: ; use v0
; GISEL-NEXT: ;;#ASMEND
More information about the llvm-commits
mailing list