[llvm] GlobalISel needs fdiv 1 / sqrt(x) to rsq combine (PR #78673)
Nick Anderson via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 8 22:53:20 PST 2024
https://github.com/nickleus27 updated https://github.com/llvm/llvm-project/pull/78673
>From 59a7d7d9347cb7ae13abe7886ae4575625213706 Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Mon, 15 Jan 2024 02:38:21 -0800
Subject: [PATCH 01/11] GlobalISel needs fdiv 1 / sqrt(x) to rsq combine
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 11 ++-
.../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 15 ++++
.../GlobalISel/combine-fdiv-sqrt-to-rsq.mir | 85 +++++++++++++++++++
3 files changed, 110 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b9411e2052120d..acef73a1882b8f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -33,6 +33,15 @@ def rcp_sqrt_to_rsq : GICombineRule<
[{ return matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
+def fdiv_instr : GIDefMatchData<"MachineInstr *">;
+
+def fdiv_1_by_sqrt_to_rsq : GICombineRule<
+ (defs root:$dst, fdiv_instr:$fdivinstr),
+ (match (G_FSQRT $sqrt, $x, (MIFlags FmContract)),
+ // (G_FCONSTANT $one, $fpimm), // error: ('G_FCONSTANT') is unreachable from the pattern root!
+ (G_FDIV $dst, $sqrt, $fpimm, (MIFlags FmContract)):$fdivinstr,
+ [{ return ${fpimm}.getFPImm()->isExactlyValue(1.0); }]),
+ (apply [{ applyOneFDivSqrtToRsq(*${fdivinstr}, ${x}.getReg()); }])>;
def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
@@ -156,7 +165,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
"AMDGPUPostLegalizerCombinerImpl",
[all_combines, gfx6gfx7_combines, gfx8_combines,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
- rcp_sqrt_to_rsq, sign_extension_in_reg, smulu64]> {
+ rcp_sqrt_to_rsq, fdiv_1_by_sqrt_to_rsq, sign_extension_in_reg, smulu64]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index a1c34e92a57f35..650736c60d7cea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -83,6 +83,8 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
matchRcpSqrtToRsq(MachineInstr &MI,
std::function<void(MachineIRBuilder &)> &MatchInfo) const;
+ void applyOneFDivSqrtToRsq(MachineInstr &MI, const Register &X) const;
+
// FIXME: Should be able to have 2 separate matchdatas rather than custom
// struct boilerplate.
struct CvtF32UByteMatchInfo {
@@ -334,6 +336,19 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
return false;
}
+void AMDGPUPostLegalizerCombinerImpl::applyOneFDivSqrtToRsq(
+ MachineInstr &MI, const Register &X) const {
+ // B.setInstrAndDebugLoc(MI);
+
+ Register Dst = MI.getOperand(0).getReg();
+
+ B.buildIntrinsic(Intrinsic::amdgcn_rsq, ArrayRef<Register>({Dst}))
+ .addUse(X)
+ .setMIFlags(MI.getFlags());
+
+ MI.eraseFromParent();
+}
+
bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
Register SrcReg = MI.getOperand(1).getReg();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
new file mode 100644
index 00000000000000..07479bc607ad8c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -0,0 +1,85 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: rsq_f16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: rsq_f16
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
+ ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %one:_(s16) = G_FCONSTANT half 1.0
+ %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+
+...
+
+---
+name: neg_rsq_f16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: neg_rsq_f16
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
+ ; GCN-NEXT: %neg_one:_(s16) = G_FCONSTANT half 0xHBC00
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %neg_one:_(s16) = G_FCONSTANT half -1.0
+ %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+
+...
+
+---
+name: rsq_f16_multi_use
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: rsq_f16_multi_use
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
+ ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %one:_(s16) = G_FCONSTANT half 1.0
+ %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+ S_ENDPGM 0, implicit %sqrt
+
+...
>From e9acad2f6b4fd7a06d646279bde12808bee10ce6 Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Tue, 23 Jan 2024 01:20:54 -0800
Subject: [PATCH 02/11] fixup! GlobalISel needs fdiv 1 / sqrt(x) to rsq combine
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index acef73a1882b8f..e8158d202a3a53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -38,8 +38,8 @@ def fdiv_instr : GIDefMatchData<"MachineInstr *">;
def fdiv_1_by_sqrt_to_rsq : GICombineRule<
(defs root:$dst, fdiv_instr:$fdivinstr),
(match (G_FSQRT $sqrt, $x, (MIFlags FmContract)),
- // (G_FCONSTANT $one, $fpimm), // error: ('G_FCONSTANT') is unreachable from the pattern root!
- (G_FDIV $dst, $sqrt, $fpimm, (MIFlags FmContract)):$fdivinstr,
+ (G_FCONSTANT $one, $fpimm),
+ (G_FDIV $dst, $one, $sqrt, (MIFlags FmContract)):$fdivinstr,
[{ return ${fpimm}.getFPImm()->isExactlyValue(1.0); }]),
(apply [{ applyOneFDivSqrtToRsq(*${fdivinstr}, ${x}.getReg()); }])>;
>From 43026b8013796983f90646e964404f7b4cc4f8b8 Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Tue, 23 Jan 2024 21:07:46 -0800
Subject: [PATCH 03/11] fixup! GlobalISel needs fdiv 1 / sqrt(x) to rsq combine
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 11 +++++------
.../AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir | 11 +++--------
2 files changed, 8 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index e8158d202a3a53..6e6a714ee70010 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -33,15 +33,14 @@ def rcp_sqrt_to_rsq : GICombineRule<
[{ return matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
-def fdiv_instr : GIDefMatchData<"MachineInstr *">;
-
def fdiv_1_by_sqrt_to_rsq : GICombineRule<
- (defs root:$dst, fdiv_instr:$fdivinstr),
+ (defs root:$root),
(match (G_FSQRT $sqrt, $x, (MIFlags FmContract)),
(G_FCONSTANT $one, $fpimm),
- (G_FDIV $dst, $one, $sqrt, (MIFlags FmContract)):$fdivinstr,
- [{ return ${fpimm}.getFPImm()->isExactlyValue(1.0); }]),
- (apply [{ applyOneFDivSqrtToRsq(*${fdivinstr}, ${x}.getReg()); }])>;
+ (G_FDIV $dst, $one, $sqrt, (MIFlags FmContract)):$root,
+ [{ return ${fpimm}.getFPImm()->isExactlyValue(1.0)
+ || ${fpimm}.getFPImm()->isExactlyValue(-1.0); }]),
+ (apply [{ applyOneFDivSqrtToRsq(*${root}, ${x}.getReg()); }])>;
def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
index 07479bc607ad8c..134e69768303f6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -13,9 +13,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
- ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
- ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
- ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
@@ -40,9 +38,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
- ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
- ; GCN-NEXT: %neg_one:_(s16) = G_FCONSTANT half 0xHBC00
- ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt
+ ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
@@ -68,8 +64,7 @@ body: |
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
- ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
- ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
>From 8683e840582a3a70ef7a8e52c7808408c5dbe73e Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Wed, 24 Jan 2024 00:50:09 -0800
Subject: [PATCH 04/11] fixup! GlobalISel needs fdiv 1 / sqrt(x) to rsq combine
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 2 +-
.../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 8 +-
.../GlobalISel/combine-fdiv-sqrt-to-rsq.mir | 98 ++++++++++++++++++-
3 files changed, 97 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 6e6a714ee70010..5e388a80741dd0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -39,7 +39,7 @@ def fdiv_1_by_sqrt_to_rsq : GICombineRule<
(G_FCONSTANT $one, $fpimm),
(G_FDIV $dst, $one, $sqrt, (MIFlags FmContract)):$root,
[{ return ${fpimm}.getFPImm()->isExactlyValue(1.0)
- || ${fpimm}.getFPImm()->isExactlyValue(-1.0); }]),
+ || ${fpimm}.getFPImm()->isExactlyValue(-1.0); }]),
(apply [{ applyOneFDivSqrtToRsq(*${root}, ${x}.getReg()); }])>;
def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 650736c60d7cea..ae4e657283ec06 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -338,14 +338,8 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
void AMDGPUPostLegalizerCombinerImpl::applyOneFDivSqrtToRsq(
MachineInstr &MI, const Register &X) const {
- // B.setInstrAndDebugLoc(MI);
-
Register Dst = MI.getOperand(0).getReg();
-
- B.buildIntrinsic(Intrinsic::amdgcn_rsq, ArrayRef<Register>({Dst}))
- .addUse(X)
- .setMIFlags(MI.getFlags());
-
+ B.buildIntrinsic(Intrinsic::amdgcn_rsq, Dst).addUse(X);
MI.eraseFromParent();
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
index 134e69768303f6..402b8b5495a23b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -13,7 +13,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
- ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
@@ -38,7 +38,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
- ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
@@ -64,7 +64,7 @@ body: |
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
- ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
@@ -78,3 +78,95 @@ body: |
S_ENDPGM 0, implicit %sqrt
...
+
+---
+name: rsq_f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: rsq_f32
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %rsq:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+ %x:_(s32) = COPY $vgpr0
+ %sqrt:_(s32) = contract G_FSQRT %x
+ %one:_(s32) = G_FCONSTANT float 1.0
+ %rsq:_(s32) = contract G_FDIV %one, %sqrt
+ $vgpr0 = COPY %rsq
+
+...
+
+---
+name: neg_rsq_f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: neg_rsq_f32
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %rsq:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+ %x:_(s32) = COPY $vgpr0
+ %sqrt:_(s32) = contract G_FSQRT %x
+ %neg_one:_(s32) = G_FCONSTANT float -1.0
+ %rsq:_(s32) = contract G_FDIV %neg_one, %sqrt
+ $vgpr0 = COPY %rsq
+
+...
+
+---
+name: rsq_f64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: rsq_f64
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+ ; GCN-NEXT: %rsq:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
+ ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s64) = G_ANYEXT %0:_(s32)
+ %sqrt:_(s64) = contract G_FSQRT %x
+ %one:_(s64) = G_FCONSTANT double 1.0
+ %rsq:_(s64) = contract G_FDIV %one, %sqrt
+ %ext:_(s32) = G_TRUNC %rsq:_(s64)
+ $vgpr0 = COPY %ext
+
+...
+
+---
+name: neg_rsq_f64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: neg_rsq_f64
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+ ; GCN-NEXT: %rsq:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
+ ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s64) = G_ANYEXT %0:_(s32)
+ %sqrt:_(s64) = contract G_FSQRT %x
+ %neg_one:_(s64) = G_FCONSTANT double -1.0
+ %rsq:_(s64) = contract G_FDIV %neg_one, %sqrt
+ %ext:_(s32) = G_TRUNC %rsq:_(s64)
+ $vgpr0 = COPY %ext
+
+...
>From a1b88a861d7376cfd4f38c7fa45c466f06129d49 Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Fri, 26 Jan 2024 01:10:35 -0800
Subject: [PATCH 05/11] fixup! GlobalISel needs fdiv 1 / sqrt(x) to rsq combine
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 3 ++-
.../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 22 ++++++++++++++-----
.../GlobalISel/combine-fdiv-sqrt-to-rsq.mir | 17 ++++++++------
3 files changed, 29 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 5e388a80741dd0..65d6e66eb6162c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -40,7 +40,8 @@ def fdiv_1_by_sqrt_to_rsq : GICombineRule<
(G_FDIV $dst, $one, $sqrt, (MIFlags FmContract)):$root,
[{ return ${fpimm}.getFPImm()->isExactlyValue(1.0)
|| ${fpimm}.getFPImm()->isExactlyValue(-1.0); }]),
- (apply [{ applyOneFDivSqrtToRsq(*${root}, ${x}.getReg()); }])>;
+ (apply [{ applyOneFDivSqrtToRsq(*${root}, ${x}.getReg(),
+ ${fpimm}.getFPImm()->isExactlyValue(-1.0)); }])>;
def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index ae4e657283ec06..ffeea47080e66d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -83,7 +83,8 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
matchRcpSqrtToRsq(MachineInstr &MI,
std::function<void(MachineIRBuilder &)> &MatchInfo) const;
- void applyOneFDivSqrtToRsq(MachineInstr &MI, const Register &X) const;
+ void applyOneFDivSqrtToRsq(MachineInstr &MI, const Register &X,
+ bool isNeg) const;
// FIXME: Should be able to have 2 separate matchdatas rather than custom
// struct boilerplate.
@@ -336,10 +337,21 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
return false;
}
-void AMDGPUPostLegalizerCombinerImpl::applyOneFDivSqrtToRsq(
- MachineInstr &MI, const Register &X) const {
- Register Dst = MI.getOperand(0).getReg();
- B.buildIntrinsic(Intrinsic::amdgcn_rsq, Dst).addUse(X);
+void AMDGPUPostLegalizerCombinerImpl::applyOneFDivSqrtToRsq(MachineInstr &MI,
+ const Register &X,
+ bool isNeg) const {
+ auto Dst = MI.getOperand(0).getReg();
+ auto Flags = MI.getFlags();
+ if (isNeg) {
+ LLT DstTy = MRI.getType(Dst);
+ Register Src = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
+ .addUse(X)
+ .setMIFlags(Flags)
+ .getReg(0);
+ B.buildFNeg(Dst, Src, Flags);
+ } else {
+ B.buildIntrinsic(Intrinsic::amdgcn_rsq, Dst).addUse(X).setMIFlags(Flags);
+ }
MI.eraseFromParent();
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
index 402b8b5495a23b..82dd123bbb1251 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -13,7 +13,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
- ; GCN-NEXT: %rsq:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
@@ -38,7 +38,8 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
- ; GCN-NEXT: %rsq:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FNEG [[INT]]
; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
@@ -64,7 +65,7 @@ body: |
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
- ; GCN-NEXT: %rsq:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
@@ -90,7 +91,7 @@ body: |
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: %x:_(s32) = COPY $vgpr0
- ; GCN-NEXT: %rsq:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: %rsq:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
%x:_(s32) = COPY $vgpr0
%sqrt:_(s32) = contract G_FSQRT %x
@@ -111,7 +112,8 @@ body: |
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: %x:_(s32) = COPY $vgpr0
- ; GCN-NEXT: %rsq:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: %rsq:_(s32) = contract G_FNEG [[INT]]
; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
%x:_(s32) = COPY $vgpr0
%sqrt:_(s32) = contract G_FSQRT %x
@@ -133,7 +135,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
- ; GCN-NEXT: %rsq:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
+ ; GCN-NEXT: %rsq:_(s64) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
@@ -158,7 +160,8 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
- ; GCN-NEXT: %rsq:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
+ ; GCN-NEXT: %rsq:_(s64) = contract G_FNEG [[INT]]
; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
>From d8b91820df010e30a4fda0a687eb12b5a282b42b Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Mon, 29 Jan 2024 23:23:34 -0800
Subject: [PATCH 06/11] fixup! GlobalISel needs fdiv 1 / sqrt(x) to rsq combine
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 12 ++---
.../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 29 +++++-----
.../GlobalISel/combine-fdiv-sqrt-to-rsq.mir | 53 +++++++++++--------
3 files changed, 48 insertions(+), 46 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 65d6e66eb6162c..d6ada227ef51db 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -33,15 +33,11 @@ def rcp_sqrt_to_rsq : GICombineRule<
[{ return matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
-def fdiv_1_by_sqrt_to_rsq : GICombineRule<
+def fdiv_by_sqrt_to_rsq : GICombineRule<
(defs root:$root),
(match (G_FSQRT $sqrt, $x, (MIFlags FmContract)),
- (G_FCONSTANT $one, $fpimm),
- (G_FDIV $dst, $one, $sqrt, (MIFlags FmContract)):$root,
- [{ return ${fpimm}.getFPImm()->isExactlyValue(1.0)
- || ${fpimm}.getFPImm()->isExactlyValue(-1.0); }]),
- (apply [{ applyOneFDivSqrtToRsq(*${root}, ${x}.getReg(),
- ${fpimm}.getFPImm()->isExactlyValue(-1.0)); }])>;
+ (G_FDIV $dst, $y, $sqrt, (MIFlags FmContract)):$root),
+ (apply [{ applyFDivSqrtToRsq(*${root}, ${x}.getReg()); }])>;
def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
@@ -165,7 +161,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
"AMDGPUPostLegalizerCombinerImpl",
[all_combines, gfx6gfx7_combines, gfx8_combines,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
- rcp_sqrt_to_rsq, fdiv_1_by_sqrt_to_rsq, sign_extension_in_reg, smulu64]> {
+ rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq, sign_extension_in_reg, smulu64]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index ffeea47080e66d..beec236b793fdf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -83,8 +83,7 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
matchRcpSqrtToRsq(MachineInstr &MI,
std::function<void(MachineIRBuilder &)> &MatchInfo) const;
- void applyOneFDivSqrtToRsq(MachineInstr &MI, const Register &X,
- bool isNeg) const;
+ void applyFDivSqrtToRsq(MachineInstr &MI, const Register &X) const;
// FIXME: Should be able to have 2 separate matchdatas rather than custom
// struct boilerplate.
@@ -337,21 +336,17 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
return false;
}
-void AMDGPUPostLegalizerCombinerImpl::applyOneFDivSqrtToRsq(MachineInstr &MI,
- const Register &X,
- bool isNeg) const {
- auto Dst = MI.getOperand(0).getReg();
- auto Flags = MI.getFlags();
- if (isNeg) {
- LLT DstTy = MRI.getType(Dst);
- Register Src = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
- .addUse(X)
- .setMIFlags(Flags)
- .getReg(0);
- B.buildFNeg(Dst, Src, Flags);
- } else {
- B.buildIntrinsic(Intrinsic::amdgcn_rsq, Dst).addUse(X).setMIFlags(Flags);
- }
+void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsq(
+ MachineInstr &MI, const Register &X) const {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Y = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ uint32_t Flags = MI.getFlags();
+ Register RSQ = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
+ .addUse(X)
+ .setMIFlags(Flags)
+ .getReg(0);
+ B.buildFMul(Dst, RSQ, Y, Flags);
MI.eraseFromParent();
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
index 82dd123bbb1251..26e41a5cac2ce7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -13,14 +13,16 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
- ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %three:_(s16) = G_FCONSTANT half 0xH4200
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %three
; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
%x:_(s16) = G_TRUNC %0:_(s32)
%sqrt:_(s16) = contract G_FSQRT %x
- %one:_(s16) = G_FCONSTANT half 1.0
- %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ %three:_(s16) = G_FCONSTANT half 3.0
+ %rsq:_(s16) = contract G_FDIV %three, %sqrt
%ext:_(s32) = G_ANYEXT %rsq:_(s16)
$vgpr0 = COPY %ext
@@ -38,15 +40,16 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %neg_three:_(s16) = G_FCONSTANT half 0xHC200
; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
- ; GCN-NEXT: %rsq:_(s16) = contract G_FNEG [[INT]]
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %neg_three
; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
%x:_(s16) = G_TRUNC %0:_(s32)
%sqrt:_(s16) = contract G_FSQRT %x
- %neg_one:_(s16) = G_FCONSTANT half -1.0
- %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt
+ %neg_three:_(s16) = G_FCONSTANT half -3.0
+ %rsq:_(s16) = contract G_FDIV %neg_three, %sqrt
%ext:_(s32) = G_ANYEXT %rsq:_(s16)
$vgpr0 = COPY %ext
@@ -65,15 +68,17 @@ body: |
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
- ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %three:_(s16) = G_FCONSTANT half 0xH4200
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %three
; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
%0:_(s32) = COPY $vgpr0
%x:_(s16) = G_TRUNC %0:_(s32)
%sqrt:_(s16) = contract G_FSQRT %x
- %one:_(s16) = G_FCONSTANT half 1.0
- %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ %three:_(s16) = G_FCONSTANT half 3.0
+ %rsq:_(s16) = contract G_FDIV %three, %sqrt
%ext:_(s32) = G_ANYEXT %rsq:_(s16)
$vgpr0 = COPY %ext
S_ENDPGM 0, implicit %sqrt
@@ -91,12 +96,14 @@ body: |
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: %x:_(s32) = COPY $vgpr0
- ; GCN-NEXT: %rsq:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: %three:_(s32) = G_FCONSTANT float 3.000000e+00
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %three
; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
%x:_(s32) = COPY $vgpr0
%sqrt:_(s32) = contract G_FSQRT %x
- %one:_(s32) = G_FCONSTANT float 1.0
- %rsq:_(s32) = contract G_FDIV %one, %sqrt
+ %three:_(s32) = G_FCONSTANT float 3.0
+ %rsq:_(s32) = contract G_FDIV %three, %sqrt
$vgpr0 = COPY %rsq
...
@@ -112,13 +119,14 @@ body: |
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %neg_three:_(s32) = G_FCONSTANT float -3.000000e+00
; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
- ; GCN-NEXT: %rsq:_(s32) = contract G_FNEG [[INT]]
+ ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %neg_three
; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
%x:_(s32) = COPY $vgpr0
%sqrt:_(s32) = contract G_FSQRT %x
- %neg_one:_(s32) = G_FCONSTANT float -1.0
- %rsq:_(s32) = contract G_FDIV %neg_one, %sqrt
+ %neg_three:_(s32) = G_FCONSTANT float -3.0
+ %rsq:_(s32) = contract G_FDIV %neg_three, %sqrt
$vgpr0 = COPY %rsq
...
@@ -135,14 +143,16 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
- ; GCN-NEXT: %rsq:_(s64) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
+ ; GCN-NEXT: %three:_(s64) = G_FCONSTANT double 3.000000e+00
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
+ ; GCN-NEXT: %rsq:_(s64) = contract G_FMUL [[INT]], %three
; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
%x:_(s64) = G_ANYEXT %0:_(s32)
%sqrt:_(s64) = contract G_FSQRT %x
- %one:_(s64) = G_FCONSTANT double 1.0
- %rsq:_(s64) = contract G_FDIV %one, %sqrt
+ %three:_(s64) = G_FCONSTANT double 3.0
+ %rsq:_(s64) = contract G_FDIV %three, %sqrt
%ext:_(s32) = G_TRUNC %rsq:_(s64)
$vgpr0 = COPY %ext
@@ -160,15 +170,16 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+ ; GCN-NEXT: %neg_three:_(s64) = G_FCONSTANT double -3.000000e+00
; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
- ; GCN-NEXT: %rsq:_(s64) = contract G_FNEG [[INT]]
+ ; GCN-NEXT: %rsq:_(s64) = contract G_FMUL [[INT]], %neg_three
; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
%x:_(s64) = G_ANYEXT %0:_(s32)
%sqrt:_(s64) = contract G_FSQRT %x
- %neg_one:_(s64) = G_FCONSTANT double -1.0
- %rsq:_(s64) = contract G_FDIV %neg_one, %sqrt
+ %neg_three:_(s64) = G_FCONSTANT double -3.0
+ %rsq:_(s64) = contract G_FDIV %neg_three, %sqrt
%ext:_(s32) = G_TRUNC %rsq:_(s64)
$vgpr0 = COPY %ext
>From 2b04ce543e04d9315a7376541d97c0767151b717 Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Tue, 30 Jan 2024 00:38:14 -0800
Subject: [PATCH 07/11] fixup! GlobalISel needs fdiv 1 / sqrt(x) to rsq combine
---
.../GlobalISel/combine-fdiv-sqrt-to-rsq.mir | 148 ++++++++++++++----
1 file changed, 116 insertions(+), 32 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
index 26e41a5cac2ce7..fef71621378a6c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -13,16 +13,14 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
- ; GCN-NEXT: %three:_(s16) = G_FCONSTANT half 0xH4200
; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
- ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %three
- ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT [[INT]](s16)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
%x:_(s16) = G_TRUNC %0:_(s32)
%sqrt:_(s16) = contract G_FSQRT %x
- %three:_(s16) = G_FCONSTANT half 3.0
- %rsq:_(s16) = contract G_FDIV %three, %sqrt
+ %one:_(s16) = G_FCONSTANT half 1.0
+ %rsq:_(s16) = contract G_FDIV %one, %sqrt
%ext:_(s32) = G_ANYEXT %rsq:_(s16)
$vgpr0 = COPY %ext
@@ -40,16 +38,16 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
- ; GCN-NEXT: %neg_three:_(s16) = G_FCONSTANT half 0xHC200
+ ; GCN-NEXT: %neg_one:_(s16) = G_FCONSTANT half 0xHBC00
; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
- ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %neg_three
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %neg_one
; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
%x:_(s16) = G_TRUNC %0:_(s32)
%sqrt:_(s16) = contract G_FSQRT %x
- %neg_three:_(s16) = G_FCONSTANT half -3.0
- %rsq:_(s16) = contract G_FDIV %neg_three, %sqrt
+ %neg_one:_(s16) = G_FCONSTANT half -1.0
+ %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt
%ext:_(s32) = G_ANYEXT %rsq:_(s16)
$vgpr0 = COPY %ext
@@ -68,17 +66,15 @@ body: |
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
- ; GCN-NEXT: %three:_(s16) = G_FCONSTANT half 0xH4200
; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
- ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %three
- ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT [[INT]](s16)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
%0:_(s32) = COPY $vgpr0
%x:_(s16) = G_TRUNC %0:_(s32)
%sqrt:_(s16) = contract G_FSQRT %x
- %three:_(s16) = G_FCONSTANT half 3.0
- %rsq:_(s16) = contract G_FDIV %three, %sqrt
+ %one:_(s16) = G_FCONSTANT half 1.0
+ %rsq:_(s16) = contract G_FDIV %one, %sqrt
%ext:_(s32) = G_ANYEXT %rsq:_(s16)
$vgpr0 = COPY %ext
S_ENDPGM 0, implicit %sqrt
@@ -96,14 +92,12 @@ body: |
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: %x:_(s32) = COPY $vgpr0
- ; GCN-NEXT: %three:_(s32) = G_FCONSTANT float 3.000000e+00
; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
- ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %three
- ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[INT]](s32)
%x:_(s32) = COPY $vgpr0
%sqrt:_(s32) = contract G_FSQRT %x
- %three:_(s32) = G_FCONSTANT float 3.0
- %rsq:_(s32) = contract G_FDIV %three, %sqrt
+ %one:_(s32) = G_FCONSTANT float 1.0
+ %rsq:_(s32) = contract G_FDIV %one, %sqrt
$vgpr0 = COPY %rsq
...
@@ -119,14 +113,14 @@ body: |
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: %x:_(s32) = COPY $vgpr0
- ; GCN-NEXT: %neg_three:_(s32) = G_FCONSTANT float -3.000000e+00
+ ; GCN-NEXT: %neg_one:_(s32) = G_FCONSTANT float -1.000000e+00
; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
- ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %neg_three
+ ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %neg_one
; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
%x:_(s32) = COPY $vgpr0
%sqrt:_(s32) = contract G_FSQRT %x
- %neg_three:_(s32) = G_FCONSTANT float -3.0
- %rsq:_(s32) = contract G_FDIV %neg_three, %sqrt
+ %neg_one:_(s32) = G_FCONSTANT float -1.0
+ %rsq:_(s32) = contract G_FDIV %neg_one, %sqrt
$vgpr0 = COPY %rsq
...
@@ -143,16 +137,14 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
- ; GCN-NEXT: %three:_(s64) = G_FCONSTANT double 3.000000e+00
; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
- ; GCN-NEXT: %rsq:_(s64) = contract G_FMUL [[INT]], %three
- ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
+ ; GCN-NEXT: %ext:_(s32) = G_TRUNC [[INT]](s64)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
%x:_(s64) = G_ANYEXT %0:_(s32)
%sqrt:_(s64) = contract G_FSQRT %x
- %three:_(s64) = G_FCONSTANT double 3.0
- %rsq:_(s64) = contract G_FDIV %three, %sqrt
+ %one:_(s64) = G_FCONSTANT double 1.0
+ %rsq:_(s64) = contract G_FDIV %one, %sqrt
%ext:_(s32) = G_TRUNC %rsq:_(s64)
$vgpr0 = COPY %ext
@@ -170,17 +162,109 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
- ; GCN-NEXT: %neg_three:_(s64) = G_FCONSTANT double -3.000000e+00
+ ; GCN-NEXT: %neg_one:_(s64) = G_FCONSTANT double -1.000000e+00
; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
- ; GCN-NEXT: %rsq:_(s64) = contract G_FMUL [[INT]], %neg_three
+ ; GCN-NEXT: %rsq:_(s64) = contract G_FMUL [[INT]], %neg_one
; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
%x:_(s64) = G_ANYEXT %0:_(s32)
%sqrt:_(s64) = contract G_FSQRT %x
- %neg_three:_(s64) = G_FCONSTANT double -3.0
- %rsq:_(s64) = contract G_FDIV %neg_three, %sqrt
+ %neg_one:_(s64) = G_FCONSTANT double -1.0
+ %rsq:_(s64) = contract G_FDIV %neg_one, %sqrt
%ext:_(s32) = G_TRUNC %rsq:_(s64)
$vgpr0 = COPY %ext
...
+
+---
+name: rsq_fract_num_f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: rsq_fract_num_f32
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %fract:_(s32) = G_FCONSTANT float 5.000000e-01
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %fract
+ ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+ %x:_(s32) = COPY $vgpr0
+ %sqrt:_(s32) = contract G_FSQRT %x
+ %fract:_(s32) = G_FCONSTANT float 0.5
+ %rsq:_(s32) = contract G_FDIV %fract, %sqrt
+ $vgpr0 = COPY %rsq
+
+...
+
+---
+name: neg_rsq_fract_num_f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: neg_rsq_fract_num_f32
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %neg_fract:_(s32) = G_FCONSTANT float -5.000000e-01
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %neg_fract
+ ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+ %x:_(s32) = COPY $vgpr0
+ %sqrt:_(s32) = contract G_FSQRT %x
+ %neg_fract:_(s32) = G_FCONSTANT float -0.5
+ %rsq:_(s32) = contract G_FDIV %neg_fract, %sqrt
+ $vgpr0 = COPY %rsq
+
+...
+
+---
+name: rsq_large_num_f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: rsq_large_num_f32
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %ten:_(s32) = G_FCONSTANT float 1.000000e+01
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %ten
+ ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+ %x:_(s32) = COPY $vgpr0
+ %sqrt:_(s32) = contract G_FSQRT %x
+ %ten:_(s32) = G_FCONSTANT float 10.0
+ %rsq:_(s32) = contract G_FDIV %ten, %sqrt
+ $vgpr0 = COPY %rsq
+
+...
+
+---
+name: neg_rsq_large_num_f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: neg_rsq_large_num_f32
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %neg_ten:_(s32) = G_FCONSTANT float -1.000000e+01
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %neg_ten
+ ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+ %x:_(s32) = COPY $vgpr0
+ %sqrt:_(s32) = contract G_FSQRT %x
+ %neg_ten:_(s32) = G_FCONSTANT float -10.0
+ %rsq:_(s32) = contract G_FDIV %neg_ten, %sqrt
+ $vgpr0 = COPY %rsq
+
+...
>From 1eab05e233927296ad86f9b36aaac609b656dc44 Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Tue, 6 Feb 2024 01:06:02 -0800
Subject: [PATCH 08/11] fixup! GlobalISel needs fdiv 1 / sqrt(x) to rsq combine
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 3 +-
.../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 22 +++
.../GlobalISel/combine-fdiv-sqrt-to-rsq.mir | 174 +++++++++++++++---
3 files changed, 173 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index d6ada227ef51db..9499a9fe469294 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -36,7 +36,8 @@ def rcp_sqrt_to_rsq : GICombineRule<
def fdiv_by_sqrt_to_rsq : GICombineRule<
(defs root:$root),
(match (G_FSQRT $sqrt, $x, (MIFlags FmContract)),
- (G_FDIV $dst, $y, $sqrt, (MIFlags FmContract)):$root),
+ (G_FDIV $dst, $y, $sqrt, (MIFlags FmContract)):$root,
+ [{ return matchFDivSqrtToRsq(*${root}); }]),
(apply [{ applyFDivSqrtToRsq(*${root}, ${x}.getReg()); }])>;
def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index beec236b793fdf..198cbd0685aa5c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -83,6 +83,7 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
matchRcpSqrtToRsq(MachineInstr &MI,
std::function<void(MachineIRBuilder &)> &MatchInfo) const;
+ bool matchFDivSqrtToRsq(MachineInstr &MI) const;
void applyFDivSqrtToRsq(MachineInstr &MI, const Register &X) const;
// FIXME: Should be able to have 2 separate matchdatas rather than custom
@@ -336,6 +337,27 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
return false;
}
+bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsq(
+ MachineInstr &MI) const {
+ Register Dst = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ const MachineFunction &MF = B.getMF();
+ bool AllowInaccurateRsq =
+ MI.getFlag(MachineInstr::FmAfn) || MF.getTarget().Options.UnsafeFPMath;
+ // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+ // the CI documentation has a worst case error of 1 ulp.
+ // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+ // use it as long as we aren't trying to use denormals.
+ //
+ // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
+ if (!AllowInaccurateRsq && DstTy != LLT::scalar(16)) {
+ return false;
+ }
+ // What about v_rsq_f64? - Is UnsafeFPMath sufficient to do this for f64? The
+ // maximum ULP error seems really high at 2^29 ULP.
+ return true;
+}
+
void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsq(
MachineInstr &MI, const Register &X) const {
Register Dst = MI.getOperand(0).getReg();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
index fef71621378a6c..bf741301ca68f2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -92,12 +92,12 @@ body: |
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: %x:_(s32) = COPY $vgpr0
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
; GCN-NEXT: $vgpr0 = COPY [[INT]](s32)
%x:_(s32) = COPY $vgpr0
- %sqrt:_(s32) = contract G_FSQRT %x
+ %sqrt:_(s32) = contract afn G_FSQRT %x
%one:_(s32) = G_FCONSTANT float 1.0
- %rsq:_(s32) = contract G_FDIV %one, %sqrt
+ %rsq:_(s32) = contract afn G_FDIV %one, %sqrt
$vgpr0 = COPY %rsq
...
@@ -113,9 +113,9 @@ body: |
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %sqrt:_(s32) = contract G_FSQRT %x
; GCN-NEXT: %neg_one:_(s32) = G_FCONSTANT float -1.000000e+00
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
- ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %neg_one
+ ; GCN-NEXT: %rsq:_(s32) = contract G_FDIV %neg_one, %sqrt
; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
%x:_(s32) = COPY $vgpr0
%sqrt:_(s32) = contract G_FSQRT %x
@@ -125,6 +125,75 @@ body: |
...
+---
+name: afn_rsq_f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: afn_rsq_f32
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[INT]](s32)
+ %x:_(s32) = COPY $vgpr0
+ %sqrt:_(s32) = contract afn G_FSQRT %x
+ %one:_(s32) = G_FCONSTANT float 1.0
+ %rsq:_(s32) = contract afn G_FDIV %one, %sqrt
+ $vgpr0 = COPY %rsq
+
+...
+
+---
+name: afn_rsq_f32_multi_use
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: afn_rsq_f32_multi_use
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %sqrt:_(s32) = contract afn G_FSQRT %x
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: %ret:_(s32) = G_FSUB %sqrt, [[INT]]
+ ; GCN-NEXT: $vgpr0 = COPY %ret(s32)
+ %x:_(s32) = COPY $vgpr0
+ %sqrt:_(s32) = contract afn G_FSQRT %x
+ %one:_(s32) = G_FCONSTANT float 1.0
+ %rsq:_(s32) = contract afn G_FDIV %one, %sqrt
+ %ret:_(s32) = G_FSUB %sqrt, %rsq
+ $vgpr0 = COPY %ret
+
+...
+
+---
+name: afn_neg_rsq_f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: afn_neg_rsq_f32
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %neg_one:_(s32) = G_FCONSTANT float -1.000000e+00
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: %rsq:_(s32) = contract afn G_FMUL [[INT]], %neg_one
+ ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+ %x:_(s32) = COPY $vgpr0
+ %sqrt:_(s32) = contract afn G_FSQRT %x
+ %neg_one:_(s32) = G_FCONSTANT float -1.0
+ %rsq:_(s32) = contract afn G_FDIV %neg_one, %sqrt
+ $vgpr0 = COPY %rsq
+
+...
+
+
---
name: rsq_f64
tracksRegLiveness: true
@@ -137,8 +206,10 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
- ; GCN-NEXT: %ext:_(s32) = G_TRUNC [[INT]](s64)
+ ; GCN-NEXT: %sqrt:_(s64) = contract G_FSQRT %x
+ ; GCN-NEXT: %one:_(s64) = G_FCONSTANT double 1.000000e+00
+ ; GCN-NEXT: %rsq:_(s64) = contract G_FDIV %one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
%x:_(s64) = G_ANYEXT %0:_(s32)
@@ -162,9 +233,9 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s64) = contract G_FSQRT %x
; GCN-NEXT: %neg_one:_(s64) = G_FCONSTANT double -1.000000e+00
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
- ; GCN-NEXT: %rsq:_(s64) = contract G_FMUL [[INT]], %neg_one
+ ; GCN-NEXT: %rsq:_(s64) = contract G_FDIV %neg_one, %sqrt
; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
@@ -177,6 +248,59 @@ body: |
...
+---
+name: afn_rsq_f64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: afn_rsq_f64
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
+ ; GCN-NEXT: %ext:_(s32) = G_TRUNC [[INT]](s64)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s64) = G_ANYEXT %0:_(s32)
+ %sqrt:_(s64) = contract afn G_FSQRT %x
+ %one:_(s64) = G_FCONSTANT double 1.0
+ %rsq:_(s64) = contract afn G_FDIV %one, %sqrt
+ %ext:_(s32) = G_TRUNC %rsq:_(s64)
+ $vgpr0 = COPY %ext
+
+...
+
+---
+name: afn_neg_rsq_f64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: afn_neg_rsq_f64
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+ ; GCN-NEXT: %neg_one:_(s64) = G_FCONSTANT double -1.000000e+00
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
+ ; GCN-NEXT: %rsq:_(s64) = contract afn G_FMUL [[INT]], %neg_one
+ ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s64) = G_ANYEXT %0:_(s32)
+ %sqrt:_(s64) = contract afn G_FSQRT %x
+ %neg_one:_(s64) = G_FCONSTANT double -1.0
+ %rsq:_(s64) = contract afn G_FDIV %neg_one, %sqrt
+ %ext:_(s32) = G_TRUNC %rsq:_(s64)
+ $vgpr0 = COPY %ext
+
+...
+
+
---
name: rsq_fract_num_f32
tracksRegLiveness: true
@@ -189,13 +313,13 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: %x:_(s32) = COPY $vgpr0
; GCN-NEXT: %fract:_(s32) = G_FCONSTANT float 5.000000e-01
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
- ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %fract
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: %rsq:_(s32) = contract afn G_FMUL [[INT]], %fract
; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
%x:_(s32) = COPY $vgpr0
- %sqrt:_(s32) = contract G_FSQRT %x
+ %sqrt:_(s32) = contract afn G_FSQRT %x
%fract:_(s32) = G_FCONSTANT float 0.5
- %rsq:_(s32) = contract G_FDIV %fract, %sqrt
+ %rsq:_(s32) = contract afn G_FDIV %fract, %sqrt
$vgpr0 = COPY %rsq
...
@@ -212,13 +336,13 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: %x:_(s32) = COPY $vgpr0
; GCN-NEXT: %neg_fract:_(s32) = G_FCONSTANT float -5.000000e-01
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
- ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %neg_fract
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: %rsq:_(s32) = contract afn G_FMUL [[INT]], %neg_fract
; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
%x:_(s32) = COPY $vgpr0
- %sqrt:_(s32) = contract G_FSQRT %x
+ %sqrt:_(s32) = contract afn G_FSQRT %x
%neg_fract:_(s32) = G_FCONSTANT float -0.5
- %rsq:_(s32) = contract G_FDIV %neg_fract, %sqrt
+ %rsq:_(s32) = contract afn G_FDIV %neg_fract, %sqrt
$vgpr0 = COPY %rsq
...
@@ -235,13 +359,13 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: %x:_(s32) = COPY $vgpr0
; GCN-NEXT: %ten:_(s32) = G_FCONSTANT float 1.000000e+01
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
- ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %ten
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: %rsq:_(s32) = contract afn G_FMUL [[INT]], %ten
; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
%x:_(s32) = COPY $vgpr0
- %sqrt:_(s32) = contract G_FSQRT %x
+ %sqrt:_(s32) = contract afn G_FSQRT %x
%ten:_(s32) = G_FCONSTANT float 10.0
- %rsq:_(s32) = contract G_FDIV %ten, %sqrt
+ %rsq:_(s32) = contract afn G_FDIV %ten, %sqrt
$vgpr0 = COPY %rsq
...
@@ -258,13 +382,13 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: %x:_(s32) = COPY $vgpr0
; GCN-NEXT: %neg_ten:_(s32) = G_FCONSTANT float -1.000000e+01
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
- ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %neg_ten
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+ ; GCN-NEXT: %rsq:_(s32) = contract afn G_FMUL [[INT]], %neg_ten
; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
%x:_(s32) = COPY $vgpr0
- %sqrt:_(s32) = contract G_FSQRT %x
+ %sqrt:_(s32) = contract afn G_FSQRT %x
%neg_ten:_(s32) = G_FCONSTANT float -10.0
- %rsq:_(s32) = contract G_FDIV %neg_ten, %sqrt
+ %rsq:_(s32) = contract afn G_FDIV %neg_ten, %sqrt
$vgpr0 = COPY %rsq
...
>From 4e9fc327581f55507ddf1500b2e623057f1f7d9b Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Tue, 6 Feb 2024 01:40:24 -0800
Subject: [PATCH 09/11] fixup! GlobalISel needs fdiv 1 / sqrt(x) to rsq combine
---
llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 4 ++++
.../AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir | 10 ++++++----
2 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 198cbd0685aa5c..ecdf8463f15672 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -340,10 +340,14 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsq(
MachineInstr &MI) const {
Register Dst = MI.getOperand(0).getReg();
+ Register Sqrt = MI.getOperand(2).getReg();
LLT DstTy = MRI.getType(Dst);
const MachineFunction &MF = B.getMF();
bool AllowInaccurateRsq =
MI.getFlag(MachineInstr::FmAfn) || MF.getTarget().Options.UnsafeFPMath;
+ if (!MRI.hasOneUse(Sqrt)) {
+ return false;
+ }
// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
// the CI documentation has a worst case error of 1 ulp.
// OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
index bf741301ca68f2..1c7ee8bb14e9b3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -66,8 +66,9 @@ body: |
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
- ; GCN-NEXT: %ext:_(s32) = G_ANYEXT [[INT]](s16)
+ ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
%0:_(s32) = COPY $vgpr0
@@ -158,8 +159,9 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: %x:_(s32) = COPY $vgpr0
; GCN-NEXT: %sqrt:_(s32) = contract afn G_FSQRT %x
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
- ; GCN-NEXT: %ret:_(s32) = G_FSUB %sqrt, [[INT]]
+ ; GCN-NEXT: %one:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GCN-NEXT: %rsq:_(s32) = contract afn G_FDIV %one, %sqrt
+ ; GCN-NEXT: %ret:_(s32) = G_FSUB %sqrt, %rsq
; GCN-NEXT: $vgpr0 = COPY %ret(s32)
%x:_(s32) = COPY $vgpr0
%sqrt:_(s32) = contract afn G_FSQRT %x
>From 09ed159cef3d01acfcc8692561bc114fa95c1cb2 Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Wed, 7 Feb 2024 00:49:21 -0800
Subject: [PATCH 10/11] fixup! GlobalISel needs fdiv 1 / sqrt(x) to rsq combine
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 8 +-
.../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 28 ++--
.../GlobalISel/combine-fdiv-sqrt-to-rsq.mir | 143 ++++++++++--------
3 files changed, 97 insertions(+), 82 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 9499a9fe469294..fa606b52acabbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -33,12 +33,12 @@ def rcp_sqrt_to_rsq : GICombineRule<
[{ return matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
-def fdiv_by_sqrt_to_rsq : GICombineRule<
+def fdiv_by_sqrt_to_rsq_f16 : GICombineRule<
(defs root:$root),
(match (G_FSQRT $sqrt, $x, (MIFlags FmContract)),
(G_FDIV $dst, $y, $sqrt, (MIFlags FmContract)):$root,
- [{ return matchFDivSqrtToRsq(*${root}); }]),
- (apply [{ applyFDivSqrtToRsq(*${root}, ${x}.getReg()); }])>;
+ [{ return matchFDivSqrtToRsqF16(*${root}); }]),
+ (apply [{ applyFDivSqrtToRsqF16(*${root}, ${x}.getReg()); }])>;
def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
@@ -162,7 +162,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
"AMDGPUPostLegalizerCombinerImpl",
[all_combines, gfx6gfx7_combines, gfx8_combines,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
- rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq, sign_extension_in_reg, smulu64]> {
+ rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index ecdf8463f15672..20f3e1a29e6599 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -83,8 +83,8 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
matchRcpSqrtToRsq(MachineInstr &MI,
std::function<void(MachineIRBuilder &)> &MatchInfo) const;
- bool matchFDivSqrtToRsq(MachineInstr &MI) const;
- void applyFDivSqrtToRsq(MachineInstr &MI, const Register &X) const;
+ bool matchFDivSqrtToRsqF16(MachineInstr &MI) const;
+ void applyFDivSqrtToRsqF16(MachineInstr &MI, const Register &X) const;
// FIXME: Should be able to have 2 separate matchdatas rather than custom
// struct boilerplate.
@@ -337,32 +337,24 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
return false;
}
-bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsq(
+bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16(
MachineInstr &MI) const {
Register Dst = MI.getOperand(0).getReg();
Register Sqrt = MI.getOperand(2).getReg();
LLT DstTy = MRI.getType(Dst);
- const MachineFunction &MF = B.getMF();
- bool AllowInaccurateRsq =
- MI.getFlag(MachineInstr::FmAfn) || MF.getTarget().Options.UnsafeFPMath;
if (!MRI.hasOneUse(Sqrt)) {
return false;
}
- // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
- // the CI documentation has a worst case error of 1 ulp.
- // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
- // use it as long as we aren't trying to use denormals.
- //
- // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
- if (!AllowInaccurateRsq && DstTy != LLT::scalar(16)) {
- return false;
+ // f32/f64 rsq is handled in AMDGPUCodeGenPrepare
+ // only match if operand type is f16
+ // v_rsq_f16 supports denormals and 0.51ulp.
+ if (DstTy == LLT::scalar(16)) {
+ return true;
}
- // What about v_rsq_f64? - Is UnsafeFPMath sufficient to do this for f64? The
- // maximum ULP error seems really high at 2^29 ULP.
- return true;
+ return false;
}
-void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsq(
+void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16(
MachineInstr &MI, const Register &X) const {
Register Dst = MI.getOperand(0).getReg();
Register Y = MI.getOperand(1).getReg();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
index 1c7ee8bb14e9b3..9508af3e7665d7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -93,12 +93,14 @@ body: |
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: %x:_(s32) = COPY $vgpr0
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
- ; GCN-NEXT: $vgpr0 = COPY [[INT]](s32)
+ ; GCN-NEXT: %sqrt:_(s32) = contract G_FSQRT %x
+ ; GCN-NEXT: %one:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GCN-NEXT: %rsq:_(s32) = contract G_FDIV %one, %sqrt
+ ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
%x:_(s32) = COPY $vgpr0
- %sqrt:_(s32) = contract afn G_FSQRT %x
+ %sqrt:_(s32) = contract G_FSQRT %x
%one:_(s32) = G_FCONSTANT float 1.0
- %rsq:_(s32) = contract afn G_FDIV %one, %sqrt
+ %rsq:_(s32) = contract G_FDIV %one, %sqrt
$vgpr0 = COPY %rsq
...
@@ -137,8 +139,10 @@ body: |
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: %x:_(s32) = COPY $vgpr0
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
- ; GCN-NEXT: $vgpr0 = COPY [[INT]](s32)
+ ; GCN-NEXT: %sqrt:_(s32) = contract afn G_FSQRT %x
+ ; GCN-NEXT: %one:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GCN-NEXT: %rsq:_(s32) = contract afn G_FDIV %one, %sqrt
+ ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
%x:_(s32) = COPY $vgpr0
%sqrt:_(s32) = contract afn G_FSQRT %x
%one:_(s32) = G_FCONSTANT float 1.0
@@ -183,9 +187,9 @@ body: |
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %sqrt:_(s32) = contract afn G_FSQRT %x
; GCN-NEXT: %neg_one:_(s32) = G_FCONSTANT float -1.000000e+00
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
- ; GCN-NEXT: %rsq:_(s32) = contract afn G_FMUL [[INT]], %neg_one
+ ; GCN-NEXT: %rsq:_(s32) = contract afn G_FDIV %neg_one, %sqrt
; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
%x:_(s32) = COPY $vgpr0
%sqrt:_(s32) = contract afn G_FSQRT %x
@@ -262,8 +266,10 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
- ; GCN-NEXT: %ext:_(s32) = G_TRUNC [[INT]](s64)
+ ; GCN-NEXT: %sqrt:_(s64) = contract afn G_FSQRT %x
+ ; GCN-NEXT: %one:_(s64) = G_FCONSTANT double 1.000000e+00
+ ; GCN-NEXT: %rsq:_(s64) = contract afn G_FDIV %one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
%x:_(s64) = G_ANYEXT %0:_(s32)
@@ -287,9 +293,9 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s64) = contract afn G_FSQRT %x
; GCN-NEXT: %neg_one:_(s64) = G_FCONSTANT double -1.000000e+00
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
- ; GCN-NEXT: %rsq:_(s64) = contract afn G_FMUL [[INT]], %neg_one
+ ; GCN-NEXT: %rsq:_(s64) = contract afn G_FDIV %neg_one, %sqrt
; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
; GCN-NEXT: $vgpr0 = COPY %ext(s32)
%0:_(s32) = COPY $vgpr0
@@ -304,93 +310,110 @@ body: |
---
-name: rsq_fract_num_f32
+name: rsq_fract_num_f16
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0
- ; GCN-LABEL: name: rsq_fract_num_f32
+ ; GCN-LABEL: name: rsq_fract_num_f16
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
- ; GCN-NEXT: %fract:_(s32) = G_FCONSTANT float 5.000000e-01
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
- ; GCN-NEXT: %rsq:_(s32) = contract afn G_FMUL [[INT]], %fract
- ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
- %x:_(s32) = COPY $vgpr0
- %sqrt:_(s32) = contract afn G_FSQRT %x
- %fract:_(s32) = G_FCONSTANT float 0.5
- %rsq:_(s32) = contract afn G_FDIV %fract, %sqrt
- $vgpr0 = COPY %rsq
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %fract:_(s16) = G_FCONSTANT half 0xH3800
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %fract
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %fract:_(s16) = G_FCONSTANT half 0.5
+ %rsq:_(s16) = contract G_FDIV %fract, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
...
---
-name: neg_rsq_fract_num_f32
+name: neg_rsq_fract_num_f16
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0
- ; GCN-LABEL: name: neg_rsq_fract_num_f32
+ ; GCN-LABEL: name: neg_rsq_fract_num_f16
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
- ; GCN-NEXT: %neg_fract:_(s32) = G_FCONSTANT float -5.000000e-01
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
- ; GCN-NEXT: %rsq:_(s32) = contract afn G_FMUL [[INT]], %neg_fract
- ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
- %x:_(s32) = COPY $vgpr0
- %sqrt:_(s32) = contract afn G_FSQRT %x
- %neg_fract:_(s32) = G_FCONSTANT float -0.5
- %rsq:_(s32) = contract afn G_FDIV %neg_fract, %sqrt
- $vgpr0 = COPY %rsq
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %neg_fract:_(s16) = G_FCONSTANT half 0xHB800
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %neg_fract
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %neg_fract:_(s16) = G_FCONSTANT half -0.5
+ %rsq:_(s16) = contract G_FDIV %neg_fract, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+
...
---
-name: rsq_large_num_f32
+name: rsq_large_num_f16
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0
- ; GCN-LABEL: name: rsq_large_num_f32
+ ; GCN-LABEL: name: rsq_large_num_f16
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
- ; GCN-NEXT: %ten:_(s32) = G_FCONSTANT float 1.000000e+01
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
- ; GCN-NEXT: %rsq:_(s32) = contract afn G_FMUL [[INT]], %ten
- ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
- %x:_(s32) = COPY $vgpr0
- %sqrt:_(s32) = contract afn G_FSQRT %x
- %ten:_(s32) = G_FCONSTANT float 10.0
- %rsq:_(s32) = contract afn G_FDIV %ten, %sqrt
- $vgpr0 = COPY %rsq
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %ten:_(s16) = G_FCONSTANT half 0xH4900
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %ten
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %ten:_(s16) = G_FCONSTANT half 10.0
+ %rsq:_(s16) = contract G_FDIV %ten, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
...
---
-name: neg_rsq_large_num_f32
+name: neg_rsq_large_num_f16
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0
- ; GCN-LABEL: name: neg_rsq_large_num_f32
+ ; GCN-LABEL: name: neg_rsq_large_num_f16
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
- ; GCN-NEXT: %neg_ten:_(s32) = G_FCONSTANT float -1.000000e+01
- ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
- ; GCN-NEXT: %rsq:_(s32) = contract afn G_FMUL [[INT]], %neg_ten
- ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
- %x:_(s32) = COPY $vgpr0
- %sqrt:_(s32) = contract afn G_FSQRT %x
- %neg_ten:_(s32) = G_FCONSTANT float -10.0
- %rsq:_(s32) = contract afn G_FDIV %neg_ten, %sqrt
- $vgpr0 = COPY %rsq
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %neg_ten:_(s16) = G_FCONSTANT half 0xHC900
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %neg_ten
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %neg_ten:_(s16) = G_FCONSTANT half -10.0
+ %rsq:_(s16) = contract G_FDIV %neg_ten, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
...
>From 53a6cfcde459820963cbacaabe4f4e3d8bc5dffd Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Thu, 8 Feb 2024 22:52:59 -0800
Subject: [PATCH 11/11] fixup! GlobalISel needs fdiv 1 / sqrt(x) to rsq combine
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 4 ++--
.../Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 13 +------------
2 files changed, 3 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index fa606b52acabbd..9218760538dc5d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -35,8 +35,8 @@ def rcp_sqrt_to_rsq : GICombineRule<
def fdiv_by_sqrt_to_rsq_f16 : GICombineRule<
(defs root:$root),
- (match (G_FSQRT $sqrt, $x, (MIFlags FmContract)),
- (G_FDIV $dst, $y, $sqrt, (MIFlags FmContract)):$root,
+ (match (G_FSQRT f16:$sqrt, $x, (MIFlags FmContract)),
+ (G_FDIV f16:$dst, $y, $sqrt, (MIFlags FmContract)):$root,
[{ return matchFDivSqrtToRsqF16(*${root}); }]),
(apply [{ applyFDivSqrtToRsqF16(*${root}, ${x}.getReg()); }])>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 20f3e1a29e6599..82e17ddad851fd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -339,19 +339,8 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16(
MachineInstr &MI) const {
- Register Dst = MI.getOperand(0).getReg();
Register Sqrt = MI.getOperand(2).getReg();
- LLT DstTy = MRI.getType(Dst);
- if (!MRI.hasOneUse(Sqrt)) {
- return false;
- }
- // f32/f64 rsq is handled in AMDGPUCodeGenPrepare
- // only match if operand type is f16
- // v_rsq_f16 supports denormals and 0.51ulp.
- if (DstTy == LLT::scalar(16)) {
- return true;
- }
- return false;
+ return MRI.hasOneNonDBGUse(Sqrt);
}
void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16(
More information about the llvm-commits mailing list