[llvm] GlobalISel needs fdiv 1 / sqrt(x) to rsq combine (PR #78673)

Nick Anderson via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 30 00:39:41 PST 2024


https://github.com/nickleus27 updated https://github.com/llvm/llvm-project/pull/78673

>From 59a7d7d9347cb7ae13abe7886ae4575625213706 Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Mon, 15 Jan 2024 02:38:21 -0800
Subject: [PATCH 1/7] GlobalISel needs fdiv 1 / sqrt(x) to rsq combine

---
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td       | 11 ++-
 .../AMDGPU/AMDGPUPostLegalizerCombiner.cpp    | 15 ++++
 .../GlobalISel/combine-fdiv-sqrt-to-rsq.mir   | 85 +++++++++++++++++++
 3 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b9411e2052120..acef73a1882b8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -33,6 +33,15 @@ def rcp_sqrt_to_rsq : GICombineRule<
          [{ return matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
 
+def fdiv_instr : GIDefMatchData<"MachineInstr *">;
+
+def fdiv_1_by_sqrt_to_rsq : GICombineRule<
+  (defs root:$dst, fdiv_instr:$fdivinstr),
+  (match (G_FSQRT $sqrt, $x, (MIFlags FmContract)),
+         // (G_FCONSTANT $one, $fpimm), // error: ('G_FCONSTANT') is unreachable from the pattern root!
+         (G_FDIV $dst, $sqrt, $fpimm, (MIFlags FmContract)):$fdivinstr,
+         [{ return ${fpimm}.getFPImm()->isExactlyValue(1.0); }]),
+  (apply [{ applyOneFDivSqrtToRsq(*${fdivinstr}, ${x}.getReg()); }])>;
 
 def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
 
@@ -156,7 +165,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
   "AMDGPUPostLegalizerCombinerImpl",
   [all_combines, gfx6gfx7_combines, gfx8_combines,
    uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
-   rcp_sqrt_to_rsq, sign_extension_in_reg, smulu64]> {
+   rcp_sqrt_to_rsq, fdiv_1_by_sqrt_to_rsq, sign_extension_in_reg, smulu64]> {
   let CombineAllMethodName = "tryCombineAllImpl";
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index a1c34e92a57f3..650736c60d7ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -83,6 +83,8 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
   matchRcpSqrtToRsq(MachineInstr &MI,
                     std::function<void(MachineIRBuilder &)> &MatchInfo) const;
 
+  void applyOneFDivSqrtToRsq(MachineInstr &MI, const Register &X) const;
+
   // FIXME: Should be able to have 2 separate matchdatas rather than custom
   // struct boilerplate.
   struct CvtF32UByteMatchInfo {
@@ -334,6 +336,19 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
   return false;
 }
 
+void AMDGPUPostLegalizerCombinerImpl::applyOneFDivSqrtToRsq(
+    MachineInstr &MI, const Register &X) const {
+  // B.setInstrAndDebugLoc(MI);
+
+  Register Dst = MI.getOperand(0).getReg();
+
+  B.buildIntrinsic(Intrinsic::amdgcn_rsq, ArrayRef<Register>({Dst}))
+      .addUse(X)
+      .setMIFlags(MI.getFlags());
+
+  MI.eraseFromParent();
+}
+
 bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
     MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
   Register SrcReg = MI.getOperand(1).getReg();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
new file mode 100644
index 0000000000000..07479bc607ad8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -0,0 +1,85 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name:            rsq_f16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: rsq_f16
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
+    ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s16) = G_TRUNC %0:_(s32)
+    %sqrt:_(s16) = contract G_FSQRT %x
+    %one:_(s16) = G_FCONSTANT half 1.0
+    %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:            neg_rsq_f16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: neg_rsq_f16
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
+    ; GCN-NEXT: %neg_one:_(s16) = G_FCONSTANT half 0xHBC00
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s16) = G_TRUNC %0:_(s32)
+    %sqrt:_(s16) = contract G_FSQRT %x
+    %neg_one:_(s16) = G_FCONSTANT half -1.0
+    %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:            rsq_f16_multi_use
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: rsq_f16_multi_use
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
+    ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s16) = G_TRUNC %0:_(s32)
+    %sqrt:_(s16) = contract G_FSQRT %x
+    %one:_(s16) = G_FCONSTANT half 1.0
+    %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+    $vgpr0 = COPY %ext
+    S_ENDPGM 0, implicit %sqrt
+
+...

>From e9acad2f6b4fd7a06d646279bde12808bee10ce6 Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Tue, 23 Jan 2024 01:20:54 -0800
Subject: [PATCH 2/7] fixup! GlobalISel needs fdiv 1 / sqrt(x) to rsq combine

---
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index acef73a1882b8..e8158d202a3a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -38,8 +38,8 @@ def fdiv_instr : GIDefMatchData<"MachineInstr *">;
 def fdiv_1_by_sqrt_to_rsq : GICombineRule<
   (defs root:$dst, fdiv_instr:$fdivinstr),
   (match (G_FSQRT $sqrt, $x, (MIFlags FmContract)),
-         // (G_FCONSTANT $one, $fpimm), // error: ('G_FCONSTANT') is unreachable from the pattern root!
-         (G_FDIV $dst, $sqrt, $fpimm, (MIFlags FmContract)):$fdivinstr,
+         (G_FCONSTANT $one, $fpimm),
+         (G_FDIV $dst, $one, $sqrt, (MIFlags FmContract)):$fdivinstr,
          [{ return ${fpimm}.getFPImm()->isExactlyValue(1.0); }]),
   (apply [{ applyOneFDivSqrtToRsq(*${fdivinstr}, ${x}.getReg()); }])>;
 

>From 43026b8013796983f90646e964404f7b4cc4f8b8 Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Tue, 23 Jan 2024 21:07:46 -0800
Subject: [PATCH 3/7] fixup! GlobalISel needs fdiv 1 / sqrt(x) to rsq combine

---
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td               | 11 +++++------
 .../AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir    | 11 +++--------
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index e8158d202a3a5..6e6a714ee7001 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -33,15 +33,14 @@ def rcp_sqrt_to_rsq : GICombineRule<
          [{ return matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
 
-def fdiv_instr : GIDefMatchData<"MachineInstr *">;
-
 def fdiv_1_by_sqrt_to_rsq : GICombineRule<
-  (defs root:$dst, fdiv_instr:$fdivinstr),
+  (defs root:$root),
   (match (G_FSQRT $sqrt, $x, (MIFlags FmContract)),
          (G_FCONSTANT $one, $fpimm),
-         (G_FDIV $dst, $one, $sqrt, (MIFlags FmContract)):$fdivinstr,
-         [{ return ${fpimm}.getFPImm()->isExactlyValue(1.0); }]),
-  (apply [{ applyOneFDivSqrtToRsq(*${fdivinstr}, ${x}.getReg()); }])>;
+         (G_FDIV $dst, $one, $sqrt, (MIFlags FmContract)):$root,
+         [{ return ${fpimm}.getFPImm()->isExactlyValue(1.0)
+         || ${fpimm}.getFPImm()->isExactlyValue(-1.0); }]),
+  (apply [{ applyOneFDivSqrtToRsq(*${root}, ${x}.getReg()); }])>;
 
 def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
index 07479bc607ad8..134e69768303f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -13,9 +13,7 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
-    ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
-    ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
-    ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
     ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     %0:_(s32) = COPY $vgpr0
@@ -40,9 +38,7 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
-    ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
-    ; GCN-NEXT: %neg_one:_(s16) = G_FCONSTANT half 0xHBC00
-    ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt
+    ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
     ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     %0:_(s32) = COPY $vgpr0
@@ -68,8 +64,7 @@ body:             |
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
     ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
-    ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
-    ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
     ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)

>From 8683e840582a3a70ef7a8e52c7808408c5dbe73e Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Wed, 24 Jan 2024 00:50:09 -0800
Subject: [PATCH 4/7] fixup! GlobalISel needs fdiv 1 / sqrt(x) to rsq combine

---
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td       |  2 +-
 .../AMDGPU/AMDGPUPostLegalizerCombiner.cpp    |  8 +-
 .../GlobalISel/combine-fdiv-sqrt-to-rsq.mir   | 98 ++++++++++++++++++-
 3 files changed, 97 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 6e6a714ee7001..5e388a80741dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -39,7 +39,7 @@ def fdiv_1_by_sqrt_to_rsq : GICombineRule<
          (G_FCONSTANT $one, $fpimm),
          (G_FDIV $dst, $one, $sqrt, (MIFlags FmContract)):$root,
          [{ return ${fpimm}.getFPImm()->isExactlyValue(1.0)
-         || ${fpimm}.getFPImm()->isExactlyValue(-1.0); }]),
+            || ${fpimm}.getFPImm()->isExactlyValue(-1.0); }]),
   (apply [{ applyOneFDivSqrtToRsq(*${root}, ${x}.getReg()); }])>;
 
 def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 650736c60d7ce..ae4e657283ec0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -338,14 +338,8 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
 
 void AMDGPUPostLegalizerCombinerImpl::applyOneFDivSqrtToRsq(
     MachineInstr &MI, const Register &X) const {
-  // B.setInstrAndDebugLoc(MI);
-
   Register Dst = MI.getOperand(0).getReg();
-
-  B.buildIntrinsic(Intrinsic::amdgcn_rsq, ArrayRef<Register>({Dst}))
-      .addUse(X)
-      .setMIFlags(MI.getFlags());
-
+  B.buildIntrinsic(Intrinsic::amdgcn_rsq, Dst).addUse(X);
   MI.eraseFromParent();
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
index 134e69768303f..402b8b5495a23 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -13,7 +13,7 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
-    ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: %rsq:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
     ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     %0:_(s32) = COPY $vgpr0
@@ -38,7 +38,7 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
-    ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: %rsq:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
     ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     %0:_(s32) = COPY $vgpr0
@@ -64,7 +64,7 @@ body:             |
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
     ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
-    ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: %rsq:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
     ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
@@ -78,3 +78,95 @@ body:             |
     S_ENDPGM 0, implicit %sqrt
 
 ...
+
+---
+name:            rsq_f32
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: rsq_f32
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %rsq:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+    ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+    %x:_(s32) = COPY $vgpr0
+    %sqrt:_(s32) = contract G_FSQRT %x
+    %one:_(s32) = G_FCONSTANT float 1.0
+    %rsq:_(s32) = contract G_FDIV %one, %sqrt
+    $vgpr0 = COPY %rsq
+
+...
+
+---
+name:            neg_rsq_f32
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: neg_rsq_f32
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %rsq:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+    ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+    %x:_(s32) = COPY $vgpr0
+    %sqrt:_(s32) = contract G_FSQRT %x
+    %neg_one:_(s32) = G_FCONSTANT float -1.0
+    %rsq:_(s32) = contract G_FDIV %neg_one, %sqrt
+    $vgpr0 = COPY %rsq
+
+...
+
+---
+name:            rsq_f64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: rsq_f64
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+    ; GCN-NEXT: %rsq:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
+    ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s64) = G_ANYEXT %0:_(s32)
+    %sqrt:_(s64) = contract G_FSQRT %x
+    %one:_(s64) = G_FCONSTANT double 1.0
+    %rsq:_(s64) = contract G_FDIV %one, %sqrt
+    %ext:_(s32) = G_TRUNC %rsq:_(s64)
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:            neg_rsq_f64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: neg_rsq_f64
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+    ; GCN-NEXT: %rsq:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
+    ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    %0:_(s32) = COPY $vgpr0
+    %x:_(s64) = G_ANYEXT %0:_(s32)
+    %sqrt:_(s64) = contract G_FSQRT %x
+    %neg_one:_(s64) = G_FCONSTANT double -1.0
+    %rsq:_(s64) = contract G_FDIV %neg_one, %sqrt
+    %ext:_(s32) = G_TRUNC %rsq:_(s64)
+    $vgpr0 = COPY %ext
+
+...

>From a1b88a861d7376cfd4f38c7fa45c466f06129d49 Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Fri, 26 Jan 2024 01:10:35 -0800
Subject: [PATCH 5/7] fixup! GlobalISel needs fdiv 1 / sqrt(x) to rsq combine

---
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td       |  3 ++-
 .../AMDGPU/AMDGPUPostLegalizerCombiner.cpp    | 22 ++++++++++++++-----
 .../GlobalISel/combine-fdiv-sqrt-to-rsq.mir   | 17 ++++++++------
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 5e388a80741dd..65d6e66eb6162 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -40,7 +40,8 @@ def fdiv_1_by_sqrt_to_rsq : GICombineRule<
          (G_FDIV $dst, $one, $sqrt, (MIFlags FmContract)):$root,
          [{ return ${fpimm}.getFPImm()->isExactlyValue(1.0)
             || ${fpimm}.getFPImm()->isExactlyValue(-1.0); }]),
-  (apply [{ applyOneFDivSqrtToRsq(*${root}, ${x}.getReg()); }])>;
+  (apply [{ applyOneFDivSqrtToRsq(*${root}, ${x}.getReg(),
+            ${fpimm}.getFPImm()->isExactlyValue(-1.0)); }])>;
 
 def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index ae4e657283ec0..ffeea47080e66 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -83,7 +83,8 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
   matchRcpSqrtToRsq(MachineInstr &MI,
                     std::function<void(MachineIRBuilder &)> &MatchInfo) const;
 
-  void applyOneFDivSqrtToRsq(MachineInstr &MI, const Register &X) const;
+  void applyOneFDivSqrtToRsq(MachineInstr &MI, const Register &X,
+                             bool isNeg) const;
 
   // FIXME: Should be able to have 2 separate matchdatas rather than custom
   // struct boilerplate.
@@ -336,10 +337,21 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
   return false;
 }
 
-void AMDGPUPostLegalizerCombinerImpl::applyOneFDivSqrtToRsq(
-    MachineInstr &MI, const Register &X) const {
-  Register Dst = MI.getOperand(0).getReg();
-  B.buildIntrinsic(Intrinsic::amdgcn_rsq, Dst).addUse(X);
+void AMDGPUPostLegalizerCombinerImpl::applyOneFDivSqrtToRsq(MachineInstr &MI,
+                                                            const Register &X,
+                                                            bool isNeg) const {
+  auto Dst = MI.getOperand(0).getReg();
+  auto Flags = MI.getFlags();
+  if (isNeg) {
+    LLT DstTy = MRI.getType(Dst);
+    Register Src = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
+                       .addUse(X)
+                       .setMIFlags(Flags)
+                       .getReg(0);
+    B.buildFNeg(Dst, Src, Flags);
+  } else {
+    B.buildIntrinsic(Intrinsic::amdgcn_rsq, Dst).addUse(X).setMIFlags(Flags);
+  }
   MI.eraseFromParent();
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
index 402b8b5495a23..82dd123bbb125 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -13,7 +13,7 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
-    ; GCN-NEXT: %rsq:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
     ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     %0:_(s32) = COPY $vgpr0
@@ -38,7 +38,8 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
-    ; GCN-NEXT: %rsq:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FNEG [[INT]]
     ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     %0:_(s32) = COPY $vgpr0
@@ -64,7 +65,7 @@ body:             |
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
     ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
-    ; GCN-NEXT: %rsq:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
     ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
@@ -90,7 +91,7 @@ body:             |
     ; GCN: liveins: $vgpr0
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
-    ; GCN-NEXT: %rsq:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+    ; GCN-NEXT: %rsq:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
     ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
     %x:_(s32) = COPY $vgpr0
     %sqrt:_(s32) = contract G_FSQRT %x
@@ -111,7 +112,8 @@ body:             |
     ; GCN: liveins: $vgpr0
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
-    ; GCN-NEXT: %rsq:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+    ; GCN-NEXT: %rsq:_(s32) = contract G_FNEG [[INT]]
     ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
     %x:_(s32) = COPY $vgpr0
     %sqrt:_(s32) = contract G_FSQRT %x
@@ -133,7 +135,7 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
-    ; GCN-NEXT: %rsq:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
+    ; GCN-NEXT: %rsq:_(s64) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
     ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     %0:_(s32) = COPY $vgpr0
@@ -158,7 +160,8 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
-    ; GCN-NEXT: %rsq:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
+    ; GCN-NEXT: %rsq:_(s64) = contract G_FNEG [[INT]]
     ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     %0:_(s32) = COPY $vgpr0

>From d8b91820df010e30a4fda0a687eb12b5a282b42b Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Mon, 29 Jan 2024 23:23:34 -0800
Subject: [PATCH 6/7] fixup! GlobalISel needs fdiv 1 / sqrt(x) to rsq combine

---
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td       | 12 ++---
 .../AMDGPU/AMDGPUPostLegalizerCombiner.cpp    | 29 +++++-----
 .../GlobalISel/combine-fdiv-sqrt-to-rsq.mir   | 53 +++++++++++--------
 3 files changed, 48 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 65d6e66eb6162..d6ada227ef51d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -33,15 +33,11 @@ def rcp_sqrt_to_rsq : GICombineRule<
          [{ return matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
 
-def fdiv_1_by_sqrt_to_rsq : GICombineRule<
+def fdiv_by_sqrt_to_rsq : GICombineRule<
   (defs root:$root),
   (match (G_FSQRT $sqrt, $x, (MIFlags FmContract)),
-         (G_FCONSTANT $one, $fpimm),
-         (G_FDIV $dst, $one, $sqrt, (MIFlags FmContract)):$root,
-         [{ return ${fpimm}.getFPImm()->isExactlyValue(1.0)
-            || ${fpimm}.getFPImm()->isExactlyValue(-1.0); }]),
-  (apply [{ applyOneFDivSqrtToRsq(*${root}, ${x}.getReg(),
-            ${fpimm}.getFPImm()->isExactlyValue(-1.0)); }])>;
+         (G_FDIV $dst, $y, $sqrt, (MIFlags FmContract)):$root),
+  (apply [{ applyFDivSqrtToRsq(*${root}, ${x}.getReg()); }])>;
 
 def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
 
@@ -165,7 +161,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
   "AMDGPUPostLegalizerCombinerImpl",
   [all_combines, gfx6gfx7_combines, gfx8_combines,
    uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
-   rcp_sqrt_to_rsq, fdiv_1_by_sqrt_to_rsq, sign_extension_in_reg, smulu64]> {
+   rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq, sign_extension_in_reg, smulu64]> {
   let CombineAllMethodName = "tryCombineAllImpl";
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index ffeea47080e66..beec236b793fd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -83,8 +83,7 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
   matchRcpSqrtToRsq(MachineInstr &MI,
                     std::function<void(MachineIRBuilder &)> &MatchInfo) const;
 
-  void applyOneFDivSqrtToRsq(MachineInstr &MI, const Register &X,
-                             bool isNeg) const;
+  void applyFDivSqrtToRsq(MachineInstr &MI, const Register &X) const;
 
   // FIXME: Should be able to have 2 separate matchdatas rather than custom
   // struct boilerplate.
@@ -337,21 +336,17 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
   return false;
 }
 
-void AMDGPUPostLegalizerCombinerImpl::applyOneFDivSqrtToRsq(MachineInstr &MI,
-                                                            const Register &X,
-                                                            bool isNeg) const {
-  auto Dst = MI.getOperand(0).getReg();
-  auto Flags = MI.getFlags();
-  if (isNeg) {
-    LLT DstTy = MRI.getType(Dst);
-    Register Src = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
-                       .addUse(X)
-                       .setMIFlags(Flags)
-                       .getReg(0);
-    B.buildFNeg(Dst, Src, Flags);
-  } else {
-    B.buildIntrinsic(Intrinsic::amdgcn_rsq, Dst).addUse(X).setMIFlags(Flags);
-  }
+void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsq(
+    MachineInstr &MI, const Register &X) const {
+  Register Dst = MI.getOperand(0).getReg();
+  Register Y = MI.getOperand(1).getReg();
+  LLT DstTy = MRI.getType(Dst);
+  uint32_t Flags = MI.getFlags();
+  Register RSQ = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
+                     .addUse(X)
+                     .setMIFlags(Flags)
+                     .getReg(0);
+  B.buildFMul(Dst, RSQ, Y, Flags);
   MI.eraseFromParent();
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
index 82dd123bbb125..26e41a5cac2ce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -13,14 +13,16 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
-    ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: %three:_(s16) = G_FCONSTANT half 0xH4200
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %three
     ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     %0:_(s32) = COPY $vgpr0
     %x:_(s16) = G_TRUNC %0:_(s32)
     %sqrt:_(s16) = contract G_FSQRT %x
-    %one:_(s16) = G_FCONSTANT half 1.0
-    %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    %three:_(s16) = G_FCONSTANT half 3.0
+    %rsq:_(s16) = contract G_FDIV %three, %sqrt
     %ext:_(s32) = G_ANYEXT %rsq:_(s16)
     $vgpr0 = COPY %ext
 
@@ -38,15 +40,16 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %neg_three:_(s16) = G_FCONSTANT half 0xHC200
     ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
-    ; GCN-NEXT: %rsq:_(s16) = contract G_FNEG [[INT]]
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %neg_three
     ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     %0:_(s32) = COPY $vgpr0
     %x:_(s16) = G_TRUNC %0:_(s32)
     %sqrt:_(s16) = contract G_FSQRT %x
-    %neg_one:_(s16) = G_FCONSTANT half -1.0
-    %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt
+    %neg_three:_(s16) = G_FCONSTANT half -3.0
+    %rsq:_(s16) = contract G_FDIV %neg_three, %sqrt
     %ext:_(s32) = G_ANYEXT %rsq:_(s16)
     $vgpr0 = COPY %ext
 
@@ -65,15 +68,17 @@ body:             |
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
     ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
-    ; GCN-NEXT: %rsq:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: %three:_(s16) = G_FCONSTANT half 0xH4200
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %three
     ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
     %0:_(s32) = COPY $vgpr0
     %x:_(s16) = G_TRUNC %0:_(s32)
     %sqrt:_(s16) = contract G_FSQRT %x
-    %one:_(s16) = G_FCONSTANT half 1.0
-    %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    %three:_(s16) = G_FCONSTANT half 3.0
+    %rsq:_(s16) = contract G_FDIV %three, %sqrt
     %ext:_(s32) = G_ANYEXT %rsq:_(s16)
     $vgpr0 = COPY %ext
     S_ENDPGM 0, implicit %sqrt
@@ -91,12 +96,14 @@ body:             |
     ; GCN: liveins: $vgpr0
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
-    ; GCN-NEXT: %rsq:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+    ; GCN-NEXT: %three:_(s32) = G_FCONSTANT float 3.000000e+00
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+    ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %three
     ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
     %x:_(s32) = COPY $vgpr0
     %sqrt:_(s32) = contract G_FSQRT %x
-    %one:_(s32) = G_FCONSTANT float 1.0
-    %rsq:_(s32) = contract G_FDIV %one, %sqrt
+    %three:_(s32) = G_FCONSTANT float 3.0
+    %rsq:_(s32) = contract G_FDIV %three, %sqrt
     $vgpr0 = COPY %rsq
 
 ...
@@ -112,13 +119,14 @@ body:             |
     ; GCN: liveins: $vgpr0
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %neg_three:_(s32) = G_FCONSTANT float -3.000000e+00
     ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
-    ; GCN-NEXT: %rsq:_(s32) = contract G_FNEG [[INT]]
+    ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %neg_three
     ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
     %x:_(s32) = COPY $vgpr0
     %sqrt:_(s32) = contract G_FSQRT %x
-    %neg_one:_(s32) = G_FCONSTANT float -1.0
-    %rsq:_(s32) = contract G_FDIV %neg_one, %sqrt
+    %neg_three:_(s32) = G_FCONSTANT float -3.0
+    %rsq:_(s32) = contract G_FDIV %neg_three, %sqrt
     $vgpr0 = COPY %rsq
 
 ...
@@ -135,14 +143,16 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
-    ; GCN-NEXT: %rsq:_(s64) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
+    ; GCN-NEXT: %three:_(s64) = G_FCONSTANT double 3.000000e+00
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
+    ; GCN-NEXT: %rsq:_(s64) = contract G_FMUL [[INT]], %three
     ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     %0:_(s32) = COPY $vgpr0
     %x:_(s64) = G_ANYEXT %0:_(s32)
     %sqrt:_(s64) = contract G_FSQRT %x
-    %one:_(s64) = G_FCONSTANT double 1.0
-    %rsq:_(s64) = contract G_FDIV %one, %sqrt
+    %three:_(s64) = G_FCONSTANT double 3.0
+    %rsq:_(s64) = contract G_FDIV %three, %sqrt
     %ext:_(s32) = G_TRUNC %rsq:_(s64)
     $vgpr0 = COPY %ext
 
@@ -160,15 +170,16 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+    ; GCN-NEXT: %neg_three:_(s64) = G_FCONSTANT double -3.000000e+00
     ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
-    ; GCN-NEXT: %rsq:_(s64) = contract G_FNEG [[INT]]
+    ; GCN-NEXT: %rsq:_(s64) = contract G_FMUL [[INT]], %neg_three
     ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     %0:_(s32) = COPY $vgpr0
     %x:_(s64) = G_ANYEXT %0:_(s32)
     %sqrt:_(s64) = contract G_FSQRT %x
-    %neg_one:_(s64) = G_FCONSTANT double -1.0
-    %rsq:_(s64) = contract G_FDIV %neg_one, %sqrt
+    %neg_three:_(s64) = G_FCONSTANT double -3.0
+    %rsq:_(s64) = contract G_FDIV %neg_three, %sqrt
     %ext:_(s32) = G_TRUNC %rsq:_(s64)
     $vgpr0 = COPY %ext
 

>From 2b04ce543e04d9315a7376541d97c0767151b717 Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Tue, 30 Jan 2024 00:38:14 -0800
Subject: [PATCH 7/7] fixup! GlobalISel needs fdiv 1 / sqrt(x) to rsq combine

---
 .../GlobalISel/combine-fdiv-sqrt-to-rsq.mir   | 148 ++++++++++++++----
 1 file changed, 116 insertions(+), 32 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
index 26e41a5cac2ce..fef71621378a6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -13,16 +13,14 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
-    ; GCN-NEXT: %three:_(s16) = G_FCONSTANT half 0xH4200
     ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
-    ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %three
-    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT [[INT]](s16)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     %0:_(s32) = COPY $vgpr0
     %x:_(s16) = G_TRUNC %0:_(s32)
     %sqrt:_(s16) = contract G_FSQRT %x
-    %three:_(s16) = G_FCONSTANT half 3.0
-    %rsq:_(s16) = contract G_FDIV %three, %sqrt
+    %one:_(s16) = G_FCONSTANT half 1.0
+    %rsq:_(s16) = contract G_FDIV %one, %sqrt
     %ext:_(s32) = G_ANYEXT %rsq:_(s16)
     $vgpr0 = COPY %ext
 
@@ -40,16 +38,16 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
-    ; GCN-NEXT: %neg_three:_(s16) = G_FCONSTANT half 0xHC200
+    ; GCN-NEXT: %neg_one:_(s16) = G_FCONSTANT half 0xHBC00
     ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
-    ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %neg_three
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %neg_one
     ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     %0:_(s32) = COPY $vgpr0
     %x:_(s16) = G_TRUNC %0:_(s32)
     %sqrt:_(s16) = contract G_FSQRT %x
-    %neg_three:_(s16) = G_FCONSTANT half -3.0
-    %rsq:_(s16) = contract G_FDIV %neg_three, %sqrt
+    %neg_one:_(s16) = G_FCONSTANT half -1.0
+    %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt
     %ext:_(s32) = G_ANYEXT %rsq:_(s16)
     $vgpr0 = COPY %ext
 
@@ -68,17 +66,15 @@ body:             |
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
     ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
-    ; GCN-NEXT: %three:_(s16) = G_FCONSTANT half 0xH4200
     ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
-    ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %three
-    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT [[INT]](s16)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
     %0:_(s32) = COPY $vgpr0
     %x:_(s16) = G_TRUNC %0:_(s32)
     %sqrt:_(s16) = contract G_FSQRT %x
-    %three:_(s16) = G_FCONSTANT half 3.0
-    %rsq:_(s16) = contract G_FDIV %three, %sqrt
+    %one:_(s16) = G_FCONSTANT half 1.0
+    %rsq:_(s16) = contract G_FDIV %one, %sqrt
     %ext:_(s32) = G_ANYEXT %rsq:_(s16)
     $vgpr0 = COPY %ext
     S_ENDPGM 0, implicit %sqrt
@@ -96,14 +92,12 @@ body:             |
     ; GCN: liveins: $vgpr0
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
-    ; GCN-NEXT: %three:_(s32) = G_FCONSTANT float 3.000000e+00
     ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
-    ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %three
-    ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+    ; GCN-NEXT: $vgpr0 = COPY [[INT]](s32)
     %x:_(s32) = COPY $vgpr0
     %sqrt:_(s32) = contract G_FSQRT %x
-    %three:_(s32) = G_FCONSTANT float 3.0
-    %rsq:_(s32) = contract G_FDIV %three, %sqrt
+    %one:_(s32) = G_FCONSTANT float 1.0
+    %rsq:_(s32) = contract G_FDIV %one, %sqrt
     $vgpr0 = COPY %rsq
 
 ...
@@ -119,14 +113,14 @@ body:             |
     ; GCN: liveins: $vgpr0
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
-    ; GCN-NEXT: %neg_three:_(s32) = G_FCONSTANT float -3.000000e+00
+    ; GCN-NEXT: %neg_one:_(s32) = G_FCONSTANT float -1.000000e+00
     ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
-    ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %neg_three
+    ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %neg_one
     ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
     %x:_(s32) = COPY $vgpr0
     %sqrt:_(s32) = contract G_FSQRT %x
-    %neg_three:_(s32) = G_FCONSTANT float -3.0
-    %rsq:_(s32) = contract G_FDIV %neg_three, %sqrt
+    %neg_one:_(s32) = G_FCONSTANT float -1.0
+    %rsq:_(s32) = contract G_FDIV %neg_one, %sqrt
     $vgpr0 = COPY %rsq
 
 ...
@@ -143,16 +137,14 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
-    ; GCN-NEXT: %three:_(s64) = G_FCONSTANT double 3.000000e+00
     ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
-    ; GCN-NEXT: %rsq:_(s64) = contract G_FMUL [[INT]], %three
-    ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
+    ; GCN-NEXT: %ext:_(s32) = G_TRUNC [[INT]](s64)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     %0:_(s32) = COPY $vgpr0
     %x:_(s64) = G_ANYEXT %0:_(s32)
     %sqrt:_(s64) = contract G_FSQRT %x
-    %three:_(s64) = G_FCONSTANT double 3.0
-    %rsq:_(s64) = contract G_FDIV %three, %sqrt
+    %one:_(s64) = G_FCONSTANT double 1.0
+    %rsq:_(s64) = contract G_FDIV %one, %sqrt
     %ext:_(s32) = G_TRUNC %rsq:_(s64)
     $vgpr0 = COPY %ext
 
@@ -170,17 +162,109 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
-    ; GCN-NEXT: %neg_three:_(s64) = G_FCONSTANT double -3.000000e+00
+    ; GCN-NEXT: %neg_one:_(s64) = G_FCONSTANT double -1.000000e+00
     ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s64)
-    ; GCN-NEXT: %rsq:_(s64) = contract G_FMUL [[INT]], %neg_three
+    ; GCN-NEXT: %rsq:_(s64) = contract G_FMUL [[INT]], %neg_one
     ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
     ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
     %0:_(s32) = COPY $vgpr0
     %x:_(s64) = G_ANYEXT %0:_(s32)
     %sqrt:_(s64) = contract G_FSQRT %x
-    %neg_three:_(s64) = G_FCONSTANT double -3.0
-    %rsq:_(s64) = contract G_FDIV %neg_three, %sqrt
+    %neg_one:_(s64) = G_FCONSTANT double -1.0
+    %rsq:_(s64) = contract G_FDIV %neg_one, %sqrt
     %ext:_(s32) = G_TRUNC %rsq:_(s64)
     $vgpr0 = COPY %ext
 
 ...
+
+---
+name:            rsq_fract_num_f32
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: rsq_fract_num_f32
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %fract:_(s32) = G_FCONSTANT float 5.000000e-01
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+    ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %fract
+    ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+    %x:_(s32) = COPY $vgpr0
+    %sqrt:_(s32) = contract G_FSQRT %x
+    %fract:_(s32) = G_FCONSTANT float 0.5
+    %rsq:_(s32) = contract G_FDIV %fract, %sqrt
+    $vgpr0 = COPY %rsq
+
+...
+
+---
+name:            neg_rsq_fract_num_f32
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: neg_rsq_fract_num_f32
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %neg_fract:_(s32) = G_FCONSTANT float -5.000000e-01
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+    ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %neg_fract
+    ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+    %x:_(s32) = COPY $vgpr0
+    %sqrt:_(s32) = contract G_FSQRT %x
+    %neg_fract:_(s32) = G_FCONSTANT float -0.5
+    %rsq:_(s32) = contract G_FDIV %neg_fract, %sqrt
+    $vgpr0 = COPY %rsq
+
+...
+
+---
+name:            rsq_large_num_f32
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: rsq_large_num_f32
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %ten:_(s32) = G_FCONSTANT float 1.000000e+01
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+    ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %ten
+    ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+    %x:_(s32) = COPY $vgpr0
+    %sqrt:_(s32) = contract G_FSQRT %x
+    %ten:_(s32) = G_FCONSTANT float 10.0
+    %rsq:_(s32) = contract G_FDIV %ten, %sqrt
+    $vgpr0 = COPY %rsq
+
+...
+
+---
+name:            neg_rsq_large_num_f32
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: neg_rsq_large_num_f32
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: %neg_ten:_(s32) = G_FCONSTANT float -1.000000e+01
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s32)
+    ; GCN-NEXT: %rsq:_(s32) = contract G_FMUL [[INT]], %neg_ten
+    ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+    %x:_(s32) = COPY $vgpr0
+    %sqrt:_(s32) = contract G_FSQRT %x
+    %neg_ten:_(s32) = G_FCONSTANT float -10.0
+    %rsq:_(s32) = contract G_FDIV %neg_ten, %sqrt
+    $vgpr0 = COPY %rsq
+
+...



More information about the llvm-commits mailing list