[llvm] ca57b80 - Code quality: Combine V_RSQ
Mateja Marjanovic via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 30 08:17:57 PST 2021
Author: Mateja Marjanovic
Date: 2021-11-30T17:17:15+01:00
New Revision: ca57b80cd6767b97477fd157831a2b099b5f8f75
URL: https://github.com/llvm/llvm-project/commit/ca57b80cd6767b97477fd157831a2b099b5f8f75
DIFF: https://github.com/llvm/llvm-project/commit/ca57b80cd6767b97477fd157831a2b099b5f8f75.diff
LOG: Code quality: Combine V_RSQ
Combine V_RCP and V_SQRT into V_RSQ on AMDGPU for GlobalISel.
Change-Id: I93c5dcb412483156a6e8b68c4085cbce83ac9703
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
Modified:
llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
llvm/lib/Target/AMDGPU/AMDGPUCombine.td
llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
llvm/lib/Target/AMDGPU/CaymanInstructions.td
llvm/lib/Target/AMDGPU/EvergreenInstructions.td
llvm/lib/Target/AMDGPU/R600Instructions.td
llvm/lib/Target/AMDGPU/SIInstructions.td
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
index 84aad9858b8b4..a41166bb4c6bb 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -526,6 +526,11 @@ inline UnaryOp_match<SrcTy, TargetOpcode::COPY> m_Copy(SrcTy &&Src) {
return UnaryOp_match<SrcTy, TargetOpcode::COPY>(std::forward<SrcTy>(Src));
}
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::G_FSQRT> m_GFSqrt(const SrcTy &Src) {
+ return UnaryOp_match<SrcTy, TargetOpcode::G_FSQRT>(Src);
+}
+
// General helper for generic MI compares, i.e. G_ICMP and G_FCMP
// TODO: Allow checking a specific predicate.
template <typename Pred_P, typename LHS_P, typename RHS_P, unsigned Opcode>
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index df2f9a0fa3a96..c7c5ff7bcbe76 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -26,6 +26,14 @@ def uchar_to_float : GICombineRule<
[{ return PostLegalizerHelper.matchUCharToFloat(*${itofp}); }]),
(apply [{ PostLegalizerHelper.applyUCharToFloat(*${itofp}); }])>;
+
+def rcp_sqrt_to_rsq : GICombineRule<
+ (defs root:$rcp, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_INTRINSIC, G_FSQRT):$rcp,
+ [{ return PostLegalizerHelper.matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
+
+
def cvt_f32_ubyteN_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::CvtF32UByteMatchInfo">;
def cvt_f32_ubyteN : GICombineRule<
@@ -86,7 +94,8 @@ def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
"AMDGPUGenPostLegalizerCombinerHelper",
[all_combines, gfx6gfx7_combines,
- uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg]> {
+ uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
+ rcp_sqrt_to_rsq]> {
let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
let AdditionalArguments = [];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 551ab7fa4de70..0528b552f4753 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -712,11 +712,6 @@ class RcpPat<Instruction RcpInst, ValueType vt> : AMDGPUPat <
(RcpInst $src)
>;
-class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat <
- (AMDGPUrcp (fsqrt vt:$src)),
- (RsqInst $src)
->;
-
// Instructions which select to the same v_min_f*
def fminnum_like : PatFrags<(ops node:$src0, node:$src1),
[(fminnum_ieee node:$src0, node:$src1),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index fc984d2dda648..1479933a2850b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
@@ -58,6 +59,9 @@ class AMDGPUPostLegalizerCombinerHelper {
bool matchUCharToFloat(MachineInstr &MI);
void applyUCharToFloat(MachineInstr &MI);
+ bool matchRcpSqrtToRsq(MachineInstr &MI,
+ std::function<void(MachineIRBuilder &)> &MatchInfo);
+
// FIXME: Should be able to have 2 separate matchdatas rather than custom
// struct boilerplate.
struct CvtF32UByteMatchInfo {
@@ -203,6 +207,48 @@ void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
MI.eraseFromParent();
}
+bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+
+ auto getRcpSrc = [=](const MachineInstr &MI) {
+ MachineInstr *ResMI = nullptr;
+ if (MI.getOpcode() == TargetOpcode::G_INTRINSIC &&
+ MI.getIntrinsicID() == Intrinsic::amdgcn_rcp)
+ ResMI = MRI.getVRegDef(MI.getOperand(2).getReg());
+
+ return ResMI;
+ };
+
+ auto getSqrtSrc = [=](const MachineInstr &MI) {
+ MachineInstr *SqrtSrcMI = nullptr;
+ mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
+ return SqrtSrcMI;
+ };
+
+ MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
+ // rcp(sqrt(x))
+ if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
+ MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
+ B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
+ .addUse(SqrtSrcMI->getOperand(0).getReg())
+ .setMIFlags(MI.getFlags());
+ };
+ return true;
+ }
+
+ // sqrt(rcp(x))
+ if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
+ MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
+ B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
+ .addUse(RcpSrcMI->getOperand(0).getReg())
+ .setMIFlags(MI.getFlags());
+ };
+ return true;
+ }
+
+ return false;
+}
+
bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
Register SrcReg = MI.getOperand(1).getReg();
diff --git a/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/llvm/lib/Target/AMDGPU/CaymanInstructions.td
index f4ddbf1131c34..d18dab0554bd4 100644
--- a/llvm/lib/Target/AMDGPU/CaymanInstructions.td
+++ b/llvm/lib/Target/AMDGPU/CaymanInstructions.td
@@ -48,8 +48,6 @@ def SIN_cm : SIN_Common<0x8D>;
def COS_cm : COS_Common<0x8E>;
} // End isVector = 1
-def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
-
def : SqrtPat<RECIPSQRT_IEEE_cm, RECIP_IEEE_cm>;
def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 12224cb3f7979..a9a3421e81924 100644
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -126,7 +126,6 @@ def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
-def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
def : SqrtPat<RECIPSQRT_IEEE_eg, RECIP_IEEE_eg>;
def SIN_eg : SIN_Common<0x8D>;
diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td
index 4487864888b6b..b3da2fdefacc7 100644
--- a/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -1265,7 +1265,6 @@ let Predicates = [isR600] in {
defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
- def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
def : SqrtPat<RECIPSQRT_IEEE_r600, RECIP_IEEE_r600>;
def R600_ExportSwz : ExportSwzInst {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2872c44c31f9e..d55d8da8699af 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -827,10 +827,6 @@ def : Pat <
let OtherPredicates = [UnsafeFPMath] in {
-//defm : RsqPat<V_RSQ_F32_e32, f32>;
-
-def : RsqPat<V_RSQ_F32_e32, f32>;
-
// Convert (x - floor(x)) to fract(x)
def : GCNPat <
(f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
new file mode 100644
index 0000000000000..cf1747ffe00f3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+define amdgpu_cs float @div_sqrt(float inreg %arg1) {
+; GCN-LABEL: div_sqrt:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_rsq_f32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+.entry:
+ %a = call float @llvm.sqrt.f32(float %arg1)
+ %b = fdiv afn float 1.000000e+00, %a
+ ret float %b
+}
+
+define amdgpu_cs float @sqrt_div(float inreg %arg1) {
+; GCN-LABEL: sqrt_div:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_rsq_f32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+.entry:
+ %a = fdiv afn float 1.000000e+00, %arg1
+ %b = call float @llvm.sqrt.f32(float %a)
+ ret float %b
+}
+
+define amdgpu_cs float @rcp_sqrt(float inreg %arg1) {
+; GCN-LABEL: rcp_sqrt:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_rsq_f32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+.entry:
+ %a = call float @llvm.sqrt.f32(float %arg1)
+ %b = call float @llvm.amdgcn.rcp.f32(float %a)
+ ret float %b
+}
+
+define amdgpu_cs float @sqrt_rcp(float inreg %arg1) {
+; GCN-LABEL: sqrt_rcp:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_rsq_f32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+.entry:
+ %a = call float @llvm.amdgcn.rcp.f32(float %arg1)
+ %b = call float @llvm.sqrt.f32(float %a)
+ ret float %b
+}
+
+
+declare float @llvm.sqrt.f32(float)
+declare float @llvm.amdgcn.rcp.f32(float)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
new file mode 100644
index 0000000000000..f85ddbaa3ae7d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
@@ -0,0 +1,42 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: rcp_sqrt_test
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GCN-LABEL: name: rcp_sqrt_test
+ ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; GCN: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+ ; GCN: $vgpr0 = COPY [[INT]](s32)
+ ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %0:_(s32) = COPY $sgpr0
+ %2:_(s32) = G_FSQRT %0:_
+ %3:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %2:_(s32)
+ $vgpr0 = COPY %3:_(s32)
+ SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
+
+---
+name: sqrt_rcp_test
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GCN-LABEL: name: sqrt_rcp_test
+ ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; GCN: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+ ; GCN: $vgpr0 = COPY [[INT]](s32)
+ ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %0:_(s32) = COPY $sgpr0
+ %2:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %0:_(s32)
+ %3:_(s32) = G_FSQRT %2:_
+ $vgpr0 = COPY %3:_(s32)
+ SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
More information about the llvm-commits
mailing list