[llvm] [Scalarizer][DirectX] Add support for scalarization of Target intrinsics (PR #108776)

Sun Sep 15 12:00:25 PDT 2024

https://github.com/farzonl created https://github.com/llvm/llvm-project/pull/108776

Since we are using the Scalarizer pass in the backend we needed a way to allow this pass to operate on Target intrinsics.
We achieved this by add `TargetTransformInfo ` to the Scalarizer pass. This allowed us to call a function available to the DirectX backend to know if an intrinsic is a target intrinsic that should be scalarized.


>From 415244c90e009fd3899940c9ce335fb717b6eac9 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzon at farzon.org>
Date: Fri, 13 Sep 2024 14:21:18 -0400
Subject: [PATCH] [DirectX] Add support for scalarization of Target  intrinsics

---
 .../llvm/Analysis/TargetTransformInfo.h       |  7 ++-
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  2 +
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  4 ++
 llvm/include/llvm/IR/IntrinsicsDirectX.td     |  4 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  4 ++
 llvm/lib/Target/DirectX/CMakeLists.txt        |  1 +
 .../DirectX/DirectXTargetTransformInfo.cpp    | 23 +++++++++
 .../DirectX/DirectXTargetTransformInfo.h      |  1 +
 llvm/lib/Transforms/Scalar/Scalarizer.cpp     | 25 +++++++---
 llvm/test/CodeGen/DirectX/frac.ll             | 48 +++++++++++--------
 llvm/test/CodeGen/DirectX/rsqrt.ll            | 36 +++++++++-----
 11 files changed, 114 insertions(+), 41 deletions(-)
 create mode 100644 llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index b2124c6106198e..f47221ebfc6001 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -882,6 +882,8 @@ class TargetTransformInfo {
   ///  should use coldcc calling convention.
   bool useColdCCForColdCall(Function &F) const;
 
+  bool isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) const;
+
   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
   /// are set if the demanded result elements need to be inserted and/or
   /// extracted from vectors.
@@ -1928,6 +1930,7 @@ class TargetTransformInfo::Concept {
   virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
   virtual bool shouldBuildRelLookupTables() = 0;
   virtual bool useColdCCForColdCall(Function &F) = 0;
+  virtual bool isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) = 0;  
   virtual InstructionCost getScalarizationOverhead(VectorType *Ty,
                                                    const APInt &DemandedElts,
                                                    bool Insert, bool Extract,
@@ -2467,7 +2470,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   bool useColdCCForColdCall(Function &F) override {
     return Impl.useColdCCForColdCall(F);
   }
-
+  bool isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) override {
+    return Impl.isTargetIntrinsicTriviallyScalarizable(ID);
+  }
   InstructionCost getScalarizationOverhead(VectorType *Ty,
                                            const APInt &DemandedElts,
                                            bool Insert, bool Extract,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 90eef93a2a54d5..757a63d4855937 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -373,6 +373,8 @@ class TargetTransformInfoImplBase {
 
   bool useColdCCForColdCall(Function &F) const { return false; }
 
+  bool isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) const { return false; }
+  
   InstructionCost getScalarizationOverhead(VectorType *Ty,
                                            const APInt &DemandedElts,
                                            bool Insert, bool Extract,
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 50dc7d5c54c54a..2d82973df30cb8 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -788,6 +788,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
 
     return Cost;
   }
+  
+  bool isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) const {
+    return false;
+  }
 
   /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
   InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index bacbbe0e69c9f2..08f78761c83b7c 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -72,7 +72,7 @@ def int_dx_udot :
     [llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
     [IntrNoMem, Commutative] >;
 
-def int_dx_frac  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+def int_dx_frac  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
 
 def int_dx_isinf :
     DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
@@ -85,7 +85,7 @@ def int_dx_length : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyf
 def int_dx_imad : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
 def int_dx_umad : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
 def int_dx_normalize : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty]>;
-def int_dx_rsqrt  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+def int_dx_rsqrt  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
 def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>;
 def int_dx_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty]>;
 def int_dx_step : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>]>;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 2c26493bd3f1ca..8736d63c2648c9 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -587,6 +587,10 @@ bool TargetTransformInfo::useColdCCForColdCall(Function &F) const {
   return TTIImpl->useColdCCForColdCall(F);
 }
 
+bool TargetTransformInfo::isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) const {
+  return TTIImpl->isTargetIntrinsicTriviallyScalarizable(ID);
+}
+
 InstructionCost TargetTransformInfo::getScalarizationOverhead(
     VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
     TTI::TargetCostKind CostKind) const {
diff --git a/llvm/lib/Target/DirectX/CMakeLists.txt b/llvm/lib/Target/DirectX/CMakeLists.txt
index f7ae09957996b5..a9c5d81391b8d7 100644
--- a/llvm/lib/Target/DirectX/CMakeLists.txt
+++ b/llvm/lib/Target/DirectX/CMakeLists.txt
@@ -18,6 +18,7 @@ add_llvm_target(DirectXCodeGen
   DirectXRegisterInfo.cpp
   DirectXSubtarget.cpp
   DirectXTargetMachine.cpp
+  DirectXTargetTransformInfo.cpp
   DXContainerGlobals.cpp
   DXILFinalizeLinkage.cpp
   DXILIntrinsicExpansion.cpp
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
new file mode 100644
index 00000000000000..8ff8f49270bdd6
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
@@ -0,0 +1,23 @@
+//===- DirectXTargetTransformInfo.cpp - DirectX TTI ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+//===----------------------------------------------------------------------===//
+
+#include "DirectXTargetTransformInfo.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsDirectX.h"
+
+ bool llvm::DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) const {
+   switch (ID) {
+    case Intrinsic::dx_frac:
+    case Intrinsic::dx_rsqrt:
+     return true;
+ default:
+     return false;
+   }
+}
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h
index ed98355fad002d..3d74754ff57f6c 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h
@@ -34,6 +34,7 @@ class DirectXTTIImpl : public BasicTTIImplBase<DirectXTTIImpl> {
       : BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)),
         TLI(ST->getTargetLowering()) {}
   unsigned getMinVectorRegisterBitWidth() const { return 32; }
+  bool isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID)  const;
 };
 } // namespace llvm
 
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index 01d24335df2262..44ef7639e8d2bc 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/BasicBlock.h"
@@ -32,6 +33,7 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsDirectX.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
@@ -281,8 +283,8 @@ T getWithDefaultOverride(const cl::opt<T> &ClOption,
 
 class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> {
 public:
-  ScalarizerVisitor(DominatorTree *DT, ScalarizerPassOptions Options)
-      : DT(DT), ScalarizeVariableInsertExtract(getWithDefaultOverride(
+  ScalarizerVisitor(DominatorTree *DT, const TargetTransformInfo *TTI, ScalarizerPassOptions Options)
+      : DT(DT), TTI(TTI), ScalarizeVariableInsertExtract(getWithDefaultOverride(
                     ClScalarizeVariableInsertExtract,
                     Options.ScalarizeVariableInsertExtract)),
         ScalarizeLoadStore(getWithDefaultOverride(ClScalarizeLoadStore,
@@ -292,6 +294,8 @@ class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> {
 
   bool visit(Function &F);
 
+  bool isTriviallyScalarizable(Intrinsic::ID ID);
+
   // InstVisitor methods.  They return true if the instruction was scalarized,
   // false if nothing changed.
   bool visitInstruction(Instruction &I) { return false; }
@@ -335,6 +339,8 @@ class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> {
   SmallVector<WeakTrackingVH, 32> PotentiallyDeadInstrs;
 
   DominatorTree *DT;
+  const TargetTransformInfo *TTI;
+
 
   const bool ScalarizeVariableInsertExtract;
   const bool ScalarizeLoadStore;
@@ -358,6 +364,7 @@ ScalarizerLegacyPass::ScalarizerLegacyPass(const ScalarizerPassOptions &Options)
 
 void ScalarizerLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<DominatorTreeWrapperPass>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
   AU.addPreserved<DominatorTreeWrapperPass>();
 }
 
@@ -445,7 +452,8 @@ bool ScalarizerLegacyPass::runOnFunction(Function &F) {
     return false;
 
   DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  ScalarizerVisitor Impl(DT, Options);
+  const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  ScalarizerVisitor Impl(DT, TTI, Options);
   return Impl.visit(F);
 }
 
@@ -689,8 +697,9 @@ bool ScalarizerVisitor::splitBinary(Instruction &I, const Splitter &Split) {
   return true;
 }
 
-static bool isTriviallyScalariable(Intrinsic::ID ID) {
-  return isTriviallyVectorizable(ID);
+bool ScalarizerVisitor::isTriviallyScalarizable(Intrinsic::ID ID) {
+
+  return TTI->isTargetIntrinsicTriviallyScalarizable(ID) || isTriviallyVectorizable(ID);
 }
 
 /// If a call to a vector typed intrinsic function, split into a scalar call per
@@ -705,7 +714,8 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) {
     return false;
 
   Intrinsic::ID ID = F->getIntrinsicID();
-  if (ID == Intrinsic::not_intrinsic || !isTriviallyScalariable(ID))
+  
+  if (ID == Intrinsic::not_intrinsic || !isTriviallyScalarizable(ID))
     return false;
 
   // unsigned NumElems = VT->getNumElements();
@@ -1249,7 +1259,8 @@ bool ScalarizerVisitor::finish() {
 
 PreservedAnalyses ScalarizerPass::run(Function &F, FunctionAnalysisManager &AM) {
   DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
-  ScalarizerVisitor Impl(DT, Options);
+  const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
+  ScalarizerVisitor Impl(DT, TTI, Options);
   bool Changed = Impl.visit(F);
   PreservedAnalyses PA;
   PA.preserve<DominatorTreeAnalysis>();
diff --git a/llvm/test/CodeGen/DirectX/frac.ll b/llvm/test/CodeGen/DirectX/frac.ll
index ae86fe06654da1..7a2c37b9a477f7 100644
--- a/llvm/test/CodeGen/DirectX/frac.ll
+++ b/llvm/test/CodeGen/DirectX/frac.ll
@@ -1,31 +1,39 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for frac are generated for float and half.
-; CHECK:call float @dx.op.unary.f32(i32 22, float %{{.*}})
-; CHECK:call half @dx.op.unary.f16(i32 22, half %{{.*}})
 
-target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
-target triple = "dxil-pc-shadermodel6.7-library"
+define noundef half @frac_half(half noundef %a) {
+entry:
+  ; CHECK:call half @dx.op.unary.f16(i32 22, half %{{.*}})
+  %dx.frac = call half @llvm.dx.frac.f16(half %a)
+  ret half %dx.frac
+}
 
-; Function Attrs: noinline nounwind optnone
 define noundef float @frac_float(float noundef %a) #0 {
 entry:
-  %a.addr = alloca float, align 4
-  store float %a, ptr %a.addr, align 4
-  %0 = load float, ptr %a.addr, align 4
-  %dx.frac = call float @llvm.dx.frac.f32(float %0)
+  ; CHECK:call float @dx.op.unary.f32(i32 22, float %{{.*}})
+  %dx.frac = call float @llvm.dx.frac.f32(float %a)
   ret float %dx.frac
 }
 
-; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
-declare float @llvm.dx.frac.f32(float) #1
-
-; Function Attrs: noinline nounwind optnone
-define noundef half @frac_half(half noundef %a) #0 {
+define noundef <4 x float> @frac_float4(<4 x float> noundef %a) #0 {
 entry:
-  %a.addr = alloca half, align 2
-  store half %a, ptr %a.addr, align 2
-  %0 = load half, ptr %a.addr, align 2
-  %dx.frac = call half @llvm.dx.frac.f16(half %0)
-  ret half %dx.frac
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 22, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 22, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 22, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 22, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.dx.frac.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
 }
+
+declare half  @llvm.dx.frac.f16(half)
+declare float @llvm.dx.frac.f32(float)
+declare <4 x float> @llvm.dx.frac.v4f32(<4 x float>)
\ No newline at end of file
diff --git a/llvm/test/CodeGen/DirectX/rsqrt.ll b/llvm/test/CodeGen/DirectX/rsqrt.ll
index 054c84483ef826..d7d85d377b127b 100644
--- a/llvm/test/CodeGen/DirectX/rsqrt.ll
+++ b/llvm/test/CodeGen/DirectX/rsqrt.ll
@@ -1,28 +1,42 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for rsqrt are generated for float and half.
 
 ; CHECK-LABEL: rsqrt_float
-; CHECK: call float @dx.op.unary.f32(i32 25, float %{{.*}})
 define noundef float @rsqrt_float(float noundef %a) {
 entry:
-  %a.addr = alloca float, align 4
-  store float %a, ptr %a.addr, align 4
-  %0 = load float, ptr %a.addr, align 4
-  %dx.rsqrt = call float @llvm.dx.rsqrt.f32(float %0)
+; CHECK: call float @dx.op.unary.f32(i32 25, float %{{.*}})
+  %dx.rsqrt = call float @llvm.dx.rsqrt.f32(float %a)
   ret float %dx.rsqrt
 }
 
 ; CHECK-LABEL: rsqrt_half
-; CHECK: call half @dx.op.unary.f16(i32 25, half %{{.*}})
 define noundef half @rsqrt_half(half noundef %a) {
 entry:
-  %a.addr = alloca half, align 2
-  store half %a, ptr %a.addr, align 2
-  %0 = load half, ptr %a.addr, align 2
-  %dx.rsqrt = call half @llvm.dx.rsqrt.f16(half %0)
+; CHECK: call half @dx.op.unary.f16(i32 25, half %{{.*}})
+  %dx.rsqrt = call half @llvm.dx.rsqrt.f16(half %a)
   ret half %dx.rsqrt
 }
 
+define noundef <4 x float> @rsqrt_float4(<4 x float> noundef %a) #0 {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 25, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 25, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 25, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 25, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.dx.rsqrt.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
+
 declare half @llvm.dx.rsqrt.f16(half)
 declare float @llvm.dx.rsqrt.f32(float)
+declare <4 x float> @llvm.dx.rsqrt.v4f32(<4 x float>)