[llvm] GlobalISel needs fdiv 1 / sqrt(x) to rsq combine (PR #78673)

Thu Feb 8 01:37:17 PST 2024

================
@@ -337,32 +337,24 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
   return false;
 }
 
-bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsq(
+bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16(
     MachineInstr &MI) const {
   Register Dst = MI.getOperand(0).getReg();
   Register Sqrt = MI.getOperand(2).getReg();
   LLT DstTy = MRI.getType(Dst);
-  const MachineFunction &MF = B.getMF();
-  bool AllowInaccurateRsq =
-      MI.getFlag(MachineInstr::FmAfn) || MF.getTarget().Options.UnsafeFPMath;
   if (!MRI.hasOneUse(Sqrt)) {
     return false;
   }
-  // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
-  // the CI documentation has a worst case error of 1 ulp.
-  // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
-  // use it as long as we aren't trying to use denormals.
-  //
-  // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
-  if (!AllowInaccurateRsq && DstTy != LLT::scalar(16)) {
-    return false;
+  // f32/f64 rsq is handled in AMDGPUCodeGenPrepare
+  // only match if operand type is f16
+  // v_rsq_f16 supports denormals and 0.51ulp.
+  if (DstTy == LLT::scalar(16)) {
----------------
arsenm wrote:

You can move the type check into the pattern.

https://github.com/llvm/llvm-project/pull/78673