[llvm] fb7be0d - AMDGPU/GlobalISel: Remove redundant G_FCANONICALIZE
Petar Avramovic via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 27 03:27:06 PDT 2021
Author: Petar Avramovic
Date: 2021-04-27T12:26:37+02:00
New Revision: fb7be0d912cbcba68803456fbde3fd311b3922ed
URL: https://github.com/llvm/llvm-project/commit/fb7be0d912cbcba68803456fbde3fd311b3922ed
DIFF: https://github.com/llvm/llvm-project/commit/fb7be0d912cbcba68803456fbde3fd311b3922ed.diff
LOG: AMDGPU/GlobalISel: Remove redundant G_FCANONICALIZE
Add a basic version of isCanonicalized for GlobalISel, copied from the SelectionDAG implementation.
Add a post-legalizer combine that deletes G_FCANONICALIZE when its input
is already canonicalized.
Differential Revision: https://reviews.llvm.org/D96605
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
Modified:
llvm/lib/Target/AMDGPU/AMDGPUCombine.td
llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.h
llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 9b71428e7fea..c6273adca50f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -56,6 +56,14 @@ def int_minmax_to_med3 : GICombineRule<
[{ return RegBankHelper.matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
(apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>;
+def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">; // Match data for the rule below: a Register passed from the match step to the apply step.
+
+def remove_fcanonicalize : GICombineRule< // Combine rule rooted at a G_FCANONICALIZE instruction.
+ (defs root:$fcanonicalize, remove_fcanonicalize_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_FCANONICALIZE):$fcanonicalize, // Only G_FCANONICALIZE is considered.
+ [{ return PostLegalizerHelper.matchRemoveFcanonicalize(*${fcanonicalize}, ${matchinfo}); }]), // C++ predicate; it also fills in ${matchinfo}.
+ (apply [{ Helper.replaceSingleDefInstWithReg(*${fcanonicalize}, ${matchinfo}); }])>; // On a match, hands the instruction and ${matchinfo} to replaceSingleDefInstWithReg.
+
// Combines which should only apply on SI/VI
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
@@ -68,7 +76,7 @@ def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
"AMDGPUGenPostLegalizerCombinerHelper",
[all_combines, gfx6gfx7_combines,
- uchar_to_float, cvt_f32_ubyteN]> {
+ uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize]> {
let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
let AdditionalArguments = [];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 09e2c762abdb..728be811afae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -66,6 +66,8 @@ class AMDGPUPostLegalizerCombinerHelper {
bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
void applyCvtF32UByteN(MachineInstr &MI,
const CvtF32UByteMatchInfo &MatchInfo);
+
+ bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
};
bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
@@ -245,6 +247,14 @@ void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
MI.eraseFromParent();
}
+bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize( // Match predicate used by the remove_fcanonicalize combine rule.
+ MachineInstr &MI, Register &Reg) { // MI: the candidate instruction; Reg: out-parameter set to MI's operand 1.
+ const SITargetLowering *TLI = static_cast<const SITargetLowering *>( // The canonical-value query lives on SITargetLowering.
+ MF.getSubtarget().getTargetLowering());
+ Reg = MI.getOperand(1).getReg(); // Written unconditionally, i.e. also when the function returns false.
+ return TLI->isCanonicalized(Reg, MF); // True when that register is reported as already canonical (default MaxDepth).
+}
+
class AMDGPUPostLegalizerCombinerHelperState {
protected:
CombinerHelper &Helper;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ae68c96d5e92..c16e9d46341f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9649,6 +9649,45 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
llvm_unreachable("invalid operation");
}
+bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, // Register-based overload: reports whether the value in Reg is known to be canonical already.
+ unsigned MaxDepth) const { // MaxDepth limits how far the min/max operand walk below may recurse.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineInstr *MI = MRI.getVRegDef(Reg); // Instruction defining Reg. NOTE(review): dereferenced on the next line without a null check; confirm a def always exists here.
+ unsigned Opcode = MI->getOpcode();
+
+ if (Opcode == AMDGPU::G_FCANONICALIZE) // The output of a G_FCANONICALIZE is treated as canonical.
+ return true;
+
+ if (Opcode == AMDGPU::G_FCONSTANT) { // Constants are decided directly from the immediate, regardless of MaxDepth.
+ auto F = MI->getOperand(1).getFPImm()->getValueAPF();
+ if (F.isNaN() && F.isSignaling()) // A signaling-NaN constant is reported as not canonical.
+ return false;
+ return !F.isDenormal() || denormalsEnabledForType(MRI.getType(Reg), MF); // A denormal constant qualifies only if denormals are enabled for Reg's type.
+ }
+
+ if (MaxDepth == 0) // Depth budget used up: answer false rather than look further.
+ return false;
+
+ switch (Opcode) {
+ case AMDGPU::G_FMINNUM_IEEE:
+ case AMDGPU::G_FMAXNUM_IEEE: {
+ if (Subtarget->supportsMinMaxDenormModes() || // Accepted outright when the subtarget reports min/max denorm-mode support...
+ denormalsEnabledForType(MRI.getType(Reg), MF)) // ...or when denormals are enabled for the result type.
+ return true;
+ for (unsigned I = 1, E = MI->getNumOperands(); I != E; ++I) { // Otherwise every operand from index 1 onward must itself be canonical.
+ if (!isCanonicalized(MI->getOperand(I).getReg(), MF, MaxDepth - 1)) // Recurse with one less level of depth.
+ return false;
+ }
+ return true;
+ }
+ default: // Every other opcode takes this path.
+ return denormalsEnabledForType(MRI.getType(Reg), MF) && // Requires denormals enabled for the type...
+ isKnownNeverSNaN(Reg, MRI); // ...and isKnownNeverSNaN to hold for Reg.
+ }
+
+ llvm_unreachable("invalid operation"); // Every switch path above returns, so control does not get here.
+}
+
// Constant fold canonicalize.
SDValue SITargetLowering::getCanonicalConstantFP(
SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
@@ -12014,6 +12053,19 @@ bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
}
}
+bool SITargetLowering::denormalsEnabledForType(LLT Ty, // LLT-based overload: selects the denormal query from the type's bit width.
+ MachineFunction &MF) const { // MF is forwarded to the per-width denormal queries below.
+ switch (Ty.getScalarSizeInBits()) { // Dispatches on Ty.getScalarSizeInBits().
+ case 32:
+ return hasFP32Denormals(MF); // 32-bit: the FP32 denormal query.
+ case 64:
+ case 16:
+ return hasFP64FP16Denormals(MF); // 64-bit and 16-bit share a single query.
+ default:
+ return false; // Any other width is reported as not having denormals enabled.
+ }
+}
+
bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
bool SNaN,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 9109d10df04d..b7e13774e4d4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -444,7 +444,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
unsigned MaxDepth = 5) const;
+ bool isCanonicalized(Register Reg, MachineFunction &MF,
+ unsigned MaxDepth = 5) const;
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const;
+ bool denormalsEnabledForType(LLT Ty, MachineFunction &MF) const;
bool isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
new file mode 100644
index 000000000000..eaeca67d7654
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
@@ -0,0 +1,223 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: test_fcanonicalize
+tracksRegLiveness: true
+legalized: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: test_fcanonicalize
+ ; CHECK: liveins: $vgpr0
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY]]
+ ; CHECK: $vgpr0 = COPY [[FCANONICALIZE]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = G_FCANONICALIZE %0
+ %2:_(s32) = G_FCANONICALIZE %1
+ $vgpr0 = COPY %2(s32)
+...
+
+---
+name: test_fconstant
+tracksRegLiveness: true
+legalized: true
+body: |
+ bb.0:
+
+ ; CHECK-LABEL: name: test_fconstant
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+10
+ ; CHECK: $vgpr0 = COPY [[C]](s32)
+ %0:_(s32) = G_FCONSTANT float 1.0e10
+ %1:_(s32) = G_FCANONICALIZE %0
+ $vgpr0 = COPY %1(s32)
+...
+
+---
+name: test_denormal_fconstant
+tracksRegLiveness: true
+legalized: true
+machineFunctionInfo:
+ mode:
+ fp64-fp16-output-denormals: false
+ fp64-fp16-input-denormals: false
+body: |
+ bb.0:
+
+ ; CHECK-LABEL: name: test_denormal_fconstant
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.618950e-319
+ ; CHECK: [[FCANONICALIZE:%[0-9]+]]:_(s64) = G_FCANONICALIZE [[C]]
+ ; CHECK: $vgpr0_vgpr1 = COPY [[FCANONICALIZE]](s64)
+ %0:_(s64) = G_FCONSTANT double 0x0000000000008000
+ %1:_(s64) = G_FCANONICALIZE %0
+ $vgpr0_vgpr1 = COPY %1(s64)
+...
+
+---
+name: test_fminnum_with_fminnum_argument_s32_ieee_mode_on
+tracksRegLiveness: true
+legalized: true
+machineFunctionInfo:
+ mode:
+ ieee: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: test_fminnum_with_fminnum_argument_s32_ieee_mode_on
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY]]
+ ; CHECK: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY1]]
+ ; CHECK: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
+ ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY2]]
+ ; CHECK: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FMINNUM_IEEE]], [[FCANONICALIZE2]]
+ ; CHECK: $vgpr0 = COPY [[FMINNUM_IEEE1]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %7:_(s32) = G_FCANONICALIZE %0
+ %8:_(s32) = G_FCANONICALIZE %1
+ %2:_(s32) = G_FMINNUM_IEEE %7, %8
+ %3:_(s32) = COPY $vgpr2
+ %5:_(s32) = G_FCANONICALIZE %2
+ %6:_(s32) = G_FCANONICALIZE %3
+ %4:_(s32) = G_FMINNUM_IEEE %5, %6
+ $vgpr0 = COPY %4(s32)
+...
+
+---
+name: test_fminnum_with_fmaxnum_argument_s32_ieee_mode_on
+tracksRegLiveness: true
+legalized: true
+machineFunctionInfo:
+ mode:
+ ieee: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: test_fminnum_with_fmaxnum_argument_s32_ieee_mode_on
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY]]
+ ; CHECK: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY1]]
+ ; CHECK: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
+ ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY2]]
+ ; CHECK: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FMAXNUM_IEEE]], [[FCANONICALIZE2]]
+ ; CHECK: $vgpr0 = COPY [[FMINNUM_IEEE]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %7:_(s32) = G_FCANONICALIZE %0
+ %8:_(s32) = G_FCANONICALIZE %1
+ %2:_(s32) = G_FMAXNUM_IEEE %7, %8
+ %3:_(s32) = COPY $vgpr2
+ %5:_(s32) = G_FCANONICALIZE %2
+ %6:_(s32) = G_FCANONICALIZE %3
+ %4:_(s32) = G_FMINNUM_IEEE %5, %6
+ $vgpr0 = COPY %4(s32)
+...
+
+---
+name: test_fmaxnum_with_fmaxnum_argument_s32_ieee_mode_on
+tracksRegLiveness: true
+legalized: true
+machineFunctionInfo:
+ mode:
+ ieee: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: test_fmaxnum_with_fmaxnum_argument_s32_ieee_mode_on
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY]]
+ ; CHECK: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY1]]
+ ; CHECK: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
+ ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY2]]
+ ; CHECK: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FMAXNUM_IEEE]], [[FCANONICALIZE2]]
+ ; CHECK: $vgpr0 = COPY [[FMAXNUM_IEEE1]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %7:_(s32) = G_FCANONICALIZE %0
+ %8:_(s32) = G_FCANONICALIZE %1
+ %2:_(s32) = G_FMAXNUM_IEEE %7, %8
+ %3:_(s32) = COPY $vgpr2
+ %5:_(s32) = G_FCANONICALIZE %2
+ %6:_(s32) = G_FCANONICALIZE %3
+ %4:_(s32) = G_FMAXNUM_IEEE %5, %6
+ $vgpr0 = COPY %4(s32)
+...
+
+---
+name: test_fmaxnum_with_fminnum_argument_s32_ieee_mode_on
+tracksRegLiveness: true
+legalized: true
+machineFunctionInfo:
+ mode:
+ ieee: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: test_fmaxnum_with_fminnum_argument_s32_ieee_mode_on
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY]]
+ ; CHECK: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY1]]
+ ; CHECK: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
+ ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY2]]
+ ; CHECK: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FMINNUM_IEEE]], [[FCANONICALIZE2]]
+ ; CHECK: $vgpr0 = COPY [[FMAXNUM_IEEE]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %7:_(s32) = G_FCANONICALIZE %0
+ %8:_(s32) = G_FCANONICALIZE %1
+ %2:_(s32) = G_FMINNUM_IEEE %7, %8
+ %3:_(s32) = COPY $vgpr2
+ %5:_(s32) = G_FCANONICALIZE %2
+ %6:_(s32) = G_FCANONICALIZE %3
+ %4:_(s32) = G_FMAXNUM_IEEE %5, %6
+ $vgpr0 = COPY %4(s32)
+...
+
+---
+name: test_multiple_uses
+tracksRegLiveness: true
+legalized: true
+machineFunctionInfo:
+ mode:
+ ieee: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: test_multiple_uses
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY]]
+ ; CHECK: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY1]]
+ ; CHECK: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
+ ; CHECK: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FMINNUM_IEEE]], [[FMINNUM_IEEE]]
+ ; CHECK: $vgpr0 = COPY [[FMAXNUM_IEEE]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %6:_(s32) = G_FCANONICALIZE %0
+ %7:_(s32) = G_FCANONICALIZE %1
+ %2:_(s32) = G_FMINNUM_IEEE %6, %7
+ %4:_(s32) = G_FCANONICALIZE %2
+ %5:_(s32) = G_FCANONICALIZE %2
+ %3:_(s32) = G_FMAXNUM_IEEE %4, %5
+ $vgpr0 = COPY %3(s32)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
index dfb25de27326..295c3645dac6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
@@ -132,11 +132,8 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
; SI-NEXT: v_min_f32_e32 v5, v2, v3
; SI-NEXT: v_max_f32_e32 v2, v2, v3
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_min_f32_e32 v2, v2, v3
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_max_f32_e32 v2, v3, v2
+; SI-NEXT: v_max_f32_e32 v2, v5, v2
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
@@ -174,11 +171,8 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
; VI-NEXT: v_min_f32_e32 v5, v4, v2
; VI-NEXT: v_max_f32_e32 v2, v4, v2
; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT: v_min_f32_e32 v2, v2, v3
-; VI-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; VI-NEXT: v_max_f32_e32 v2, v3, v2
+; VI-NEXT: v_max_f32_e32 v2, v5, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -198,12 +192,9 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_max_f32_e32 v2, v4, v4
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v1
+; GFX9-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX9-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -224,10 +215,7 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
; GFX10-NEXT: v_max_f32_e32 v4, v1, v2
; GFX10-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
; GFX10-NEXT: v_min_f32_e32 v2, v4, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
; GFX10-NEXT: v_max_f32_e32 v1, v1, v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
@@ -640,11 +628,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_min_f32_e32 v2, v2, v3
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_max_f32_e32 v2, v3, v2
+; SI-NEXT: v_max_f32_e32 v2, v5, v2
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
@@ -679,13 +664,10 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(
; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT: v_min_f32_e32 v5, v4, v2
-; VI-NEXT: v_max_f32_e32 v2, v4, v2
; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; VI-NEXT: v_max_f32_e32 v2, v4, v2
; VI-NEXT: v_min_f32_e32 v2, v2, v3
-; VI-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; VI-NEXT: v_max_f32_e32 v2, v3, v2
+; VI-NEXT: v_max_f32_e32 v2, v5, v2
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -706,14 +688,11 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
; GFX9-NEXT: global_store_dword v[0:1], v4, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_max_f32_e32 v2, v4, v4
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v1
+; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -733,11 +712,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(
; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
; GFX10-NEXT: v_max_f32_e32 v4, v1, v2
; GFX10-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
; GFX10-NEXT: v_min_f32_e32 v2, v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v1, v1
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v2
+; GFX10-NEXT: v_max_f32_e32 v2, v1, v2
; GFX10-NEXT: global_store_dword v[0:1], v1, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
More information about the llvm-commits
mailing list