[llvm] c05e29b - [LegacyPM][DirectX] Add legacy scalarizer back for use in the DirectX backend (#107427)

Thu Sep 12 12:53:53 PDT 2024

Author: Farzon Lotfi
Date: 2024-09-12T15:53:50-04:00
New Revision: c05e29bff036060f0811b887a92715104abdceb5

URL: https://github.com/llvm/llvm-project/commit/c05e29bff036060f0811b887a92715104abdceb5
DIFF: https://github.com/llvm/llvm-project/commit/c05e29bff036060f0811b887a92715104abdceb5.diff

LOG: [LegacyPM][DirectX] Add legacy scalarizer back for use in the DirectX backend (#107427)

As discussed in this
[proposal](https://github.com/llvm/wg-hlsl/pull/62/files?short_path=ac6e592#diff-ac6e59276afe8016e307eedc5c835f534c0cb353707760b44df0fa9d905a5cf8).
We had to bring back the legacy pass manager interface for the
scalarizer pass. Two reasons for this:
1. The DirectX backend is still using the legacy pass manager
2. The new PM isn't hooked up in clang yet via `BackendUtil.cpp`'s
`AddEmitPasses` That means even if we add a `buildCodeGenPipeline` we
won't be able to benefit from the new pass manager's scalarizer pass
interface.

The remaining changes are hooking up the scalarizer pass to the DirectX
backend, updating the DirectX test cases,
and allowing the `optdriver` to not block the legacy invocation of the
scalarizer pass.

Future work still needs to be done to allow the scalarizer pass to
handle target specific intrinsics.

closes #105178

Added: 
    llvm/test/CodeGen/DirectX/llc-pipeline.ll
    llvm/test/CodeGen/DirectX/scalar-store.ll
    llvm/test/CodeGen/DirectX/scalarize-two-calls.ll

Modified: 
    llvm/include/llvm/InitializePasses.h
    llvm/include/llvm/LinkAllPasses.h
    llvm/include/llvm/Transforms/Scalar/Scalarizer.h
    llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
    llvm/lib/Transforms/Scalar/Scalar.cpp
    llvm/lib/Transforms/Scalar/Scalarizer.cpp
    llvm/test/CodeGen/DirectX/acos.ll
    llvm/test/CodeGen/DirectX/asin.ll
    llvm/test/CodeGen/DirectX/atan.ll
    llvm/test/CodeGen/DirectX/ceil.ll
    llvm/test/CodeGen/DirectX/cos.ll
    llvm/test/CodeGen/DirectX/cosh.ll
    llvm/test/CodeGen/DirectX/exp2.ll
    llvm/test/CodeGen/DirectX/fabs.ll
    llvm/test/CodeGen/DirectX/floor.ll
    llvm/test/CodeGen/DirectX/isinf.ll
    llvm/test/CodeGen/DirectX/reversebits.ll
    llvm/test/CodeGen/DirectX/round.ll
    llvm/test/CodeGen/DirectX/saturate.ll
    llvm/test/CodeGen/DirectX/sin.ll
    llvm/test/CodeGen/DirectX/sinh.ll
    llvm/test/CodeGen/DirectX/sqrt.ll
    llvm/test/CodeGen/DirectX/tan.ll
    llvm/test/CodeGen/DirectX/tanh.ll
    llvm/test/CodeGen/DirectX/trunc.ll
    llvm/tools/opt/optdriver.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 6605c6fde92510..4352099d6dbb99 100644

--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -276,6 +276,7 @@ void initializeSafepointIRVerifierPass(PassRegistry &);
 void initializeSelectOptimizePass(PassRegistry &);
 void initializeScalarEvolutionWrapperPassPass(PassRegistry &);
 void initializeScalarizeMaskedMemIntrinLegacyPassPass(PassRegistry &);
+void initializeScalarizerLegacyPassPass(PassRegistry &);
 void initializeScavengerTestPass(PassRegistry &);
 void initializeScopedNoAliasAAWrapperPassPass(PassRegistry &);
 void initializeSeparateConstOffsetFromGEPLegacyPassPass(PassRegistry &);

diff  --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
index 1da02153d846f1..92b59a66567c95 100644
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -130,6 +130,7 @@ struct ForcePassLinking {
     (void)llvm::createLowerAtomicPass();
     (void)llvm::createLoadStoreVectorizerPass();
     (void)llvm::createPartiallyInlineLibCallsPass();
+    (void)llvm::createScalarizerPass();
     (void)llvm::createSeparateConstOffsetFromGEPPass();
     (void)llvm::createSpeculativeExecutionPass();
     (void)llvm::createSpeculativeExecutionIfHasBranchDivergencePass();

diff  --git a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h
index 45e25cbf282149..4d2a1a2f889a3c 100644
--- a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h
@@ -24,6 +24,7 @@
 namespace llvm {
 
 class Function;
+class FunctionPass;
 
 struct ScalarizerPassOptions {
   // These options correspond 1:1 to cl::opt options defined in
@@ -50,6 +51,10 @@ class ScalarizerPass : public PassInfoMixin<ScalarizerPass> {
   void setScalarizeLoadStore(bool Value) { Options.ScalarizeLoadStore = Value; }
   void setScalarizeMinBits(unsigned Value) { Options.ScalarizeMinBits = Value; }
 };
+
+/// Create a legacy pass manager instance of the Scalarizer pass
+FunctionPass *createScalarizerPass(
+    const ScalarizerPassOptions &Options = ScalarizerPassOptions());
 }
 
 #endif /* LLVM_TRANSFORMS_SCALAR_SCALARIZER_H */

diff  --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index a29fc210421637..606022a9835f04 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -28,6 +28,7 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/MC/MCSectionDXContainer.h"
 #include "llvm/MC/SectionKind.h"
 #include "llvm/MC/TargetRegistry.h"
@@ -36,6 +37,7 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Transforms/Scalar/Scalarizer.h"
 #include <optional>
 
 using namespace llvm;
@@ -44,6 +46,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() {
   RegisterTargetMachine<DirectXTargetMachine> X(getTheDirectXTarget());
   auto *PR = PassRegistry::getPassRegistry();
   initializeDXILIntrinsicExpansionLegacyPass(*PR);
+  initializeScalarizerLegacyPassPass(*PR);
   initializeDXILPrepareModulePass(*PR);
   initializeEmbedDXILPassPass(*PR);
   initializeWriteDXILPassPass(*PR);
@@ -83,6 +86,9 @@ class DirectXPassConfig : public TargetPassConfig {
   FunctionPass *createTargetRegisterAllocator(bool) override { return nullptr; }
   void addCodeGenPrepare() override {
     addPass(createDXILIntrinsicExpansionLegacyPass());
+    ScalarizerPassOptions DxilScalarOptions;
+    DxilScalarOptions.ScalarizeLoadStore = true;
+    addPass(createScalarizerPass(DxilScalarOptions));
     addPass(createDXILOpLoweringLegacyPass());
     addPass(createDXILFinalizeLinkageLegacyPass());
     addPass(createDXILTranslateMetadataLegacyPass());

diff  --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index 7aeee1d31f7e79..fa6e671830d962 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -21,6 +21,7 @@ using namespace llvm;
 void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeConstantHoistingLegacyPassPass(Registry);
   initializeDCELegacyPassPass(Registry);
+  initializeScalarizerLegacyPassPass(Registry);
   initializeGVNLegacyPassPass(Registry);
   initializeEarlyCSELegacyPassPass(Registry);
   initializeEarlyCSEMemSSALegacyPassPass(Registry);

diff  --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index 2bed3480da1cda..01d24335df2262 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -36,6 +36,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -340,8 +341,33 @@ class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> {
   const unsigned ScalarizeMinBits;
 };
 
+class ScalarizerLegacyPass : public FunctionPass {
+public:
+  static char ID;
+  ScalarizerPassOptions Options;
+  ScalarizerLegacyPass() : FunctionPass(ID), Options() {}
+  ScalarizerLegacyPass(const ScalarizerPassOptions &Options);
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
 } // end anonymous namespace
 
+ScalarizerLegacyPass::ScalarizerLegacyPass(const ScalarizerPassOptions &Options)
+    : FunctionPass(ID), Options(Options) {}
+
+void ScalarizerLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<DominatorTreeWrapperPass>();
+  AU.addPreserved<DominatorTreeWrapperPass>();
+}
+
+char ScalarizerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ScalarizerLegacyPass, "scalarizer",
+                      "Scalarize vector operations", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ScalarizerLegacyPass, "scalarizer",
+                    "Scalarize vector operations", false, false)
+
 Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
                      const VectorSplit &VS, ValueVector *cachePtr)
     : BB(bb), BBI(bbi), V(v), VS(VS), CachePtr(cachePtr) {
@@ -414,6 +440,19 @@ Value *Scatterer::operator[](unsigned Frag) {
   return CV[Frag];
 }
 
+bool ScalarizerLegacyPass::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  ScalarizerVisitor Impl(DT, Options);
+  return Impl.visit(F);
+}
+
+FunctionPass *llvm::createScalarizerPass(const ScalarizerPassOptions &Options) {
+  return new ScalarizerLegacyPass(Options);
+}
+
 bool ScalarizerVisitor::visit(Function &F) {
   assert(Gathered.empty() && Scattered.empty());
 

diff  --git a/llvm/test/CodeGen/DirectX/acos.ll b/llvm/test/CodeGen/DirectX/acos.ll
index cc32182395627c..f4a10eb368ebfb 100644
--- a/llvm/test/CodeGen/DirectX/acos.ll
+++ b/llvm/test/CodeGen/DirectX/acos.ll
@@ -1,20 +1,39 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for acos are generated for float and half.
 
-define noundef float @tan_float(float noundef %a) {
+define noundef float @acos_float(float noundef %a) {
 entry:
 ; CHECK:call float @dx.op.unary.f32(i32 15, float %{{.*}})
   %elt.acos = call float @llvm.acos.f32(float %a)
   ret float %elt.acos
 }
 
-define noundef half @tan_half(half noundef %a) {
+define noundef half @acos_half(half noundef %a) {
 entry:
 ; CHECK:call half @dx.op.unary.f16(i32 15, half %{{.*}})
   %elt.acos = call half @llvm.acos.f16(half %a)
   ret half %elt.acos
 }
 
+define noundef <4 x float> @acos_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 15, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 15, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 15, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 15, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.acos.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.acos.f16(half)
 declare float @llvm.acos.f32(float)
+declare <4 x float> @llvm.acos.v4f32(<4 x float>)

diff  --git a/llvm/test/CodeGen/DirectX/asin.ll b/llvm/test/CodeGen/DirectX/asin.ll
index 06e3bab545a6aa..bd948f593c24e2 100644
--- a/llvm/test/CodeGen/DirectX/asin.ll
+++ b/llvm/test/CodeGen/DirectX/asin.ll
@@ -1,20 +1,39 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for asin are generated for float and half.
 
-define noundef float @tan_float(float noundef %a) {
+define noundef float @asin_float(float noundef %a) {
 entry:
 ; CHECK:call float @dx.op.unary.f32(i32 16, float %{{.*}})
   %elt.asin = call float @llvm.asin.f32(float %a)
   ret float %elt.asin
 }
 
-define noundef half @tan_half(half noundef %a) {
+define noundef half @asin_half(half noundef %a) {
 entry:
 ; CHECK:call half @dx.op.unary.f16(i32 16, half %{{.*}})
   %elt.asin = call half @llvm.asin.f16(half %a)
   ret half %elt.asin
 }
 
+define noundef <4 x float> @asin_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 16, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 16, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 16, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 16, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.asin.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.asin.f16(half)
 declare float @llvm.asin.f32(float)
+declare <4 x float> @llvm.asin.v4f32(<4 x float>)

diff  --git a/llvm/test/CodeGen/DirectX/atan.ll b/llvm/test/CodeGen/DirectX/atan.ll
index d7c4cd00e286a0..58899ab49bdb8e 100644
--- a/llvm/test/CodeGen/DirectX/atan.ll
+++ b/llvm/test/CodeGen/DirectX/atan.ll
@@ -1,20 +1,39 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for atan are generated for float and half.
 
-define noundef float @tan_float(float noundef %a) {
+define noundef float @atan_float(float noundef %a) {
 entry:
 ; CHECK:call float @dx.op.unary.f32(i32 17, float %{{.*}})
   %elt.atan = call float @llvm.atan.f32(float %a)
   ret float %elt.atan
 }
 
-define noundef half @tan_half(half noundef %a) {
+define noundef half @atan_half(half noundef %a) {
 entry:
 ; CHECK:call half @dx.op.unary.f16(i32 17, half %{{.*}})
   %elt.atan = call half @llvm.atan.f16(half %a)
   ret half %elt.atan
 }
 
+define noundef <4 x float> @atan_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 17, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 17, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 17, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 17, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.atan.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.atan.f16(half)
 declare float @llvm.atan.f32(float)
+declare <4 x float> @llvm.atan.v4f32(<4 x float>) 

diff  --git a/llvm/test/CodeGen/DirectX/ceil.ll b/llvm/test/CodeGen/DirectX/ceil.ll
index 48bc5495a8e051..bd6e747c2fbf5f 100644
--- a/llvm/test/CodeGen/DirectX/ceil.ll
+++ b/llvm/test/CodeGen/DirectX/ceil.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for ceil are generated for float and half.
 
@@ -16,5 +16,24 @@ entry:
   ret half %elt.ceil
 }
 
+define noundef <4 x float> @ceil_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 28, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 28, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 28, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 28, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.ceil.f16(half)
 declare float @llvm.ceil.f32(float)
+declare <4 x float> @llvm.ceil.v4f32(<4 x float>) 

diff  --git a/llvm/test/CodeGen/DirectX/cos.ll b/llvm/test/CodeGen/DirectX/cos.ll
index 72f4bfca23f9d5..85f5db25570b90 100644
--- a/llvm/test/CodeGen/DirectX/cos.ll
+++ b/llvm/test/CodeGen/DirectX/cos.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for cos are generated for float and half.
 
@@ -16,5 +16,24 @@ entry:
   ret half %elt.cos
 }
 
+define noundef <4 x float> @cos_float4(<4 x float> noundef %a) #0 {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.cos.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.cos.f16(half)
 declare float @llvm.cos.f32(float)
+declare <4 x float> @llvm.cos.v4f32(<4 x float>)

diff  --git a/llvm/test/CodeGen/DirectX/cosh.ll b/llvm/test/CodeGen/DirectX/cosh.ll
index 91aaf893f3997c..670a8a3eae0864 100644
--- a/llvm/test/CodeGen/DirectX/cosh.ll
+++ b/llvm/test/CodeGen/DirectX/cosh.ll
@@ -1,20 +1,39 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for cosh are generated for float and half.
 
-define noundef float @tan_float(float noundef %a) {
+define noundef float @cosh_float(float noundef %a) {
 entry:
 ; CHECK:call float @dx.op.unary.f32(i32 18, float %{{.*}})
   %elt.cosh = call float @llvm.cosh.f32(float %a)
   ret float %elt.cosh
 }
 
-define noundef half @tan_half(half noundef %a) {
+define noundef half @cosh_half(half noundef %a) {
 entry:
 ; CHECK:call half @dx.op.unary.f16(i32 18, half %{{.*}})
   %elt.cosh = call half @llvm.cosh.f16(half %a)
   ret half %elt.cosh
 }
 
+define noundef <4 x float> @cosh_float4(<4 x float> noundef %a) #0 {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 18, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 18, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 18, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 18, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.cosh.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.cosh.f16(half)
 declare float @llvm.cosh.f32(float)
+declare <4 x float> @llvm.cosh.v4f32(<4 x float>)

diff  --git a/llvm/test/CodeGen/DirectX/exp2.ll b/llvm/test/CodeGen/DirectX/exp2.ll
index b70b87dedc4d1e..6d16af6a5413e0 100644
--- a/llvm/test/CodeGen/DirectX/exp2.ll
+++ b/llvm/test/CodeGen/DirectX/exp2.ll
@@ -1,31 +1,39 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.7-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for exp2 are generated for float and half.
-; CHECK:call float @dx.op.unary.f32(i32 21, float %{{.*}})
-; CHECK:call half @dx.op.unary.f16(i32 21, half %{{.*}})
 
-target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
-target triple = "dxil-pc-shadermodel6.7-library"
-
-; Function Attrs: noinline nounwind optnone
-define noundef float @exp2_float(float noundef %a) #0 {
+define noundef float @exp2_float(float noundef %a) {
 entry:
-  %a.addr = alloca float, align 4
-  store float %a, ptr %a.addr, align 4
-  %0 = load float, ptr %a.addr, align 4
-  %elt.exp2 = call float @llvm.exp2.f32(float %0)
+  ; CHECK:call float @dx.op.unary.f32(i32 21, float %{{.*}})
+  %elt.exp2 = call float @llvm.exp2.f32(float %a)
   ret float %elt.exp2
 }
 
-; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
-declare float @llvm.exp2.f32(float) #1
-
-; Function Attrs: noinline nounwind optnone
-define noundef half @exp2_half(half noundef %a) #0 {
+define noundef half @exp2_half(half noundef %a) {
 entry:
-  %a.addr = alloca half, align 2
-  store half %a, ptr %a.addr, align 2
-  %0 = load half, ptr %a.addr, align 2
-  %elt.exp2 = call half @llvm.exp2.f16(half %0)
+  ; CHECK:call half @dx.op.unary.f16(i32 21, half %{{.*}})
+  %elt.exp2 = call half @llvm.exp2.f16(half %a)
   ret half %elt.exp2
 }
+
+define noundef <4 x float> @exp2_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 21, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 21, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 21, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 21, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
+declare float @llvm.exp2.f32(float)
+declare half @llvm.exp2.f16(half)
+declare  <4 x float> @llvm.exp2.v4f32(<4 x float> %a) 

diff  --git a/llvm/test/CodeGen/DirectX/fabs.ll b/llvm/test/CodeGen/DirectX/fabs.ll
index becbdf8d68aeb1..6d903f1c927ace 100644
--- a/llvm/test/CodeGen/DirectX/fabs.ll
+++ b/llvm/test/CodeGen/DirectX/fabs.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for abs are generated for float, half, and double.
 
@@ -27,6 +27,26 @@ entry:
   ret double %elt.abs
 }
 
+; CHECK-LABEL: fabs_float4
+define noundef <4 x float> @fabs_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 6, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 6, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 6, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 6, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.fabs.f16(half)
 declare float @llvm.fabs.f32(float)
 declare double @llvm.fabs.f64(double)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)

diff  --git a/llvm/test/CodeGen/DirectX/floor.ll b/llvm/test/CodeGen/DirectX/floor.ll
index f79f160e51e3b2..8ad81e1459a5ba 100644
--- a/llvm/test/CodeGen/DirectX/floor.ll
+++ b/llvm/test/CodeGen/DirectX/floor.ll
@@ -2,19 +2,38 @@
 
 ; Make sure dxil operation function calls for floor are generated for float and half.
 
-define noundef float @floor_float(float noundef %a) #0 {
+define noundef float @floor_float(float noundef %a) {
 entry:
 ; CHECK:call float @dx.op.unary.f32(i32 27, float %{{.*}})
   %elt.floor = call float @llvm.floor.f32(float %a)
   ret float %elt.floor
 }
 
-define noundef half @floor_half(half noundef %a) #0 {
+define noundef half @floor_half(half noundef %a) {
 entry:
 ; CHECK:call half @dx.op.unary.f16(i32 27, half %{{.*}})
   %elt.floor = call half @llvm.floor.f16(half %a)
   ret half %elt.floor
 }
 
+define noundef <4 x float> @floor_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 19, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 19, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 19, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 19, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.floor.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.floor.f16(half)
 declare float @llvm.floor.f32(float)
+declare <4 x float> @llvm.floor.v4f32(<4 x float>)

diff  --git a/llvm/test/CodeGen/DirectX/isinf.ll b/llvm/test/CodeGen/DirectX/isinf.ll
index 295776b0893476..03a00c40498d5a 100644
--- a/llvm/test/CodeGen/DirectX/isinf.ll
+++ b/llvm/test/CodeGen/DirectX/isinf.ll
@@ -1,25 +1,21 @@
 ; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for isinf are generated for float and half.
-; CHECK: call i1 @dx.op.isSpecialFloat.f32(i32 9, float %{{.*}})
-; CHECK: call i1 @dx.op.isSpecialFloat.f16(i32 9, half %{{.*}})
 
-; Function Attrs: noinline nounwind optnone
-define noundef i1 @isinf_float(float noundef %a) #0 {
+define noundef i1 @isinf_float(float noundef %a) {
 entry:
-  %a.addr = alloca float, align 4
-  store float %a, ptr %a.addr, align 4
-  %0 = load float, ptr %a.addr, align 4
-  %dx.isinf = call i1 @llvm.dx.isinf.f32(float %0)
+  ; CHECK: call i1 @dx.op.isSpecialFloat.f32(i32 9, float %{{.*}})
+  %dx.isinf = call i1 @llvm.dx.isinf.f32(float %a)
   ret i1 %dx.isinf
 }
 
-; Function Attrs: noinline nounwind optnone
-define noundef i1 @isinf_half(half noundef %p0) #0 {
+define noundef i1 @isinf_half(half noundef %a) {
 entry:
-  %p0.addr = alloca half, align 2
-  store half %p0, ptr %p0.addr, align 2
-  %0 = load half, ptr %p0.addr, align 2
-  %dx.isinf = call i1 @llvm.dx.isinf.f16(half %0)
+  ; CHECK: call i1 @dx.op.isSpecialFloat.f16(i32 9, half %{{.*}})
+  %dx.isinf = call i1 @llvm.dx.isinf.f16(half %a)
   ret i1 %dx.isinf
 }
+
+
+declare i1 @llvm.dx.isinf.f16(half)
+declare i1 @llvm.dx.isinf.f32(float)

diff  --git a/llvm/test/CodeGen/DirectX/llc-pipeline.ll b/llvm/test/CodeGen/DirectX/llc-pipeline.ll
new file mode 100644
index 00000000000000..36610bef719bf0
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/llc-pipeline.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple=dxil-pc-shadermodel6.3-library -debug-pass=Structure < %s -o /dev/null 2>&1 | \
+; RUN:     grep -v "Verify generated machine code" | FileCheck %s
+
+; REQUIRES: asserts
+
+; CHECK-LABEL: Pass Arguments:
+; CHECK-NEXT: Target Library Information
+; CHECK-NEXT: ModulePass Manager
+; CHECK-NEXT:   DXIL Intrinsic Expansion
+; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Dominator Tree Construction
+; CHECK-NEXT:     Scalarize vector operations
+; CHECK-NEXT:   DXIL Intrinsic Expansion
+; CHECK-NEXT:   DXIL Resource analysis
+; CHECK-NEXT:   DXIL Op Lowering
+; CHECK-NEXT:   DXIL Finalize Linkage
+; CHECK-NEXT:   DXIL Resource analysis
+; CHECK-NEXT:   DXIL resource Information
+; CHECK-NEXT:   DXIL Shader Flag Analysis
+; CHECK-NEXT:   DXIL Translate Metadata
+; CHECK-NEXT:   DXIL Prepare Module
+; CHECK-NEXT:   DXIL Resource analysis
+; CHECK-NEXT:   DXIL Metadata Pretty Printer
+; CHECK-NEXT:   Print Module IR
+ 

diff  --git a/llvm/test/CodeGen/DirectX/reversebits.ll b/llvm/test/CodeGen/DirectX/reversebits.ll
index 1ade57b40100ff..b5530d0850e663 100644
--- a/llvm/test/CodeGen/DirectX/reversebits.ll
+++ b/llvm/test/CodeGen/DirectX/reversebits.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for reversebits are generated for all integer types.
 
@@ -26,6 +26,25 @@ entry:
   ret i64 %elt.bitreverse
 }
 
+define noundef <4 x i32> @round_int324(<4 x i32> noundef %a) #0 {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x i32> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call i32 @dx.op.unary.i32(i32 30, i32 [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x i32> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call i32 @dx.op.unary.i32(i32 30, i32 [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x i32> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call i32 @dx.op.unary.i32(i32 30, i32 [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x i32> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call i32 @dx.op.unary.i32(i32 30, i32 [[ee3]])
+  ; CHECK: insertelement <4 x i32> poison, i32 [[ie0]], i64 0
+  ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie1]], i64 1
+  ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie2]], i64 2
+  ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie3]], i64 3
+  %2 = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) 
+  ret <4 x i32> %2
+}
+
 declare i16 @llvm.bitreverse.i16(i16)
 declare i32 @llvm.bitreverse.i32(i32)
 declare i64 @llvm.bitreverse.i64(i64)
+declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>)

diff  --git a/llvm/test/CodeGen/DirectX/round.ll b/llvm/test/CodeGen/DirectX/round.ll
index db953fb29c2046..b08cbac5f42e91 100644
--- a/llvm/test/CodeGen/DirectX/round.ll
+++ b/llvm/test/CodeGen/DirectX/round.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for round are generated for float and half.
 
@@ -18,5 +18,25 @@ entry:
   ret float %elt.roundeven
 }
 
+define noundef <4 x float> @round_float4(<4 x float> noundef %a) #0 {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 26, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 26, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 26, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 26, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
+
 declare half @llvm.roundeven.f16(half)
 declare float @llvm.roundeven.f32(float)
+declare <4 x float> @llvm.roundeven.v4f32(<4 x float>)

diff  --git a/llvm/test/CodeGen/DirectX/saturate.ll b/llvm/test/CodeGen/DirectX/saturate.ll
index a8557351756f2b..404cab7b665d0e 100644
--- a/llvm/test/CodeGen/DirectX/saturate.ll
+++ b/llvm/test/CodeGen/DirectX/saturate.ll
@@ -2,7 +2,7 @@
 ; Make sure the intrinsic dx.saturate is to appropriate DXIL op for half/float/double data types.
 
 ; CHECK-LABEL: test_saturate_half
-define noundef half @test_saturate_half(half noundef %p0) #0 {
+define noundef half @test_saturate_half(half noundef %p0) {
 entry:
   ; CHECK: call half @dx.op.unary.f16(i32 7, half %p0)
   %hlsl.saturate = call half @llvm.dx.saturate.f16(half %p0)
@@ -10,11 +10,8 @@ entry:
   ret half %hlsl.saturate
 }
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn
-declare half @llvm.dx.saturate.f16(half) #1
-
 ; CHECK-LABEL: test_saturate_float
-define noundef float @test_saturate_float(float noundef %p0) #0 {
+define noundef float @test_saturate_float(float noundef %p0) {
 entry:
   ; CHECK: call float @dx.op.unary.f32(i32 7, float %p0)
   %hlsl.saturate = call float @llvm.dx.saturate.f32(float %p0)
@@ -22,11 +19,8 @@ entry:
   ret float %hlsl.saturate
 }
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn
-declare float @llvm.dx.saturate.f32(float) #1
-
 ; CHECK-LABEL: test_saturate_double
-define noundef double @test_saturate_double(double noundef %p0) #0 {
+define noundef double @test_saturate_double(double noundef %p0) {
 entry:
   ; CHECK: call double @dx.op.unary.f64(i32 7, double %p0)
   %hlsl.saturate = call double @llvm.dx.saturate.f64(double %p0)
@@ -34,6 +28,7 @@ entry:
   ret double %hlsl.saturate
 }
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn
-declare double @llvm.dx.saturate.f64(double) #1
+declare half @llvm.dx.saturate.f16(half)
+declare float @llvm.dx.saturate.f32(float)
+declare double @llvm.dx.saturate.f64(double)
 

diff  --git a/llvm/test/CodeGen/DirectX/scalar-store.ll b/llvm/test/CodeGen/DirectX/scalar-store.ll
new file mode 100644
index 00000000000000..b970a2842e5a8b
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/scalar-store.ll
@@ -0,0 +1,17 @@
+; RUN: opt -S -scalarizer -scalarize-load-store -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: llc %s -mtriple=dxil-pc-shadermodel6.3-library --filetype=asm -o - | FileCheck %s
+
+@"sharedData" = local_unnamed_addr addrspace(3) global [2 x <3 x float>] zeroinitializer, align 16 
+; CHECK-LABEL: store_test
+define void @store_test () local_unnamed_addr {
+    ; CHECK: store float 1.000000e+00, ptr addrspace(3) {{.*}}, align {{.*}} 
+    ; CHECK: store float 2.000000e+00, ptr addrspace(3) {{.*}}, align {{.*}}
+    ; CHECK: store float 3.000000e+00, ptr addrspace(3) {{.*}}, align {{.*}} 
+    ; CHECK: store float 2.000000e+00, ptr addrspace(3) {{.*}}, align {{.*}} 
+    ; CHECK: store float 4.000000e+00, ptr addrspace(3) {{.*}}, align {{.*}} 
+    ; CHECK: store float 6.000000e+00, ptr addrspace(3) {{.*}}, align {{.*}} 
+
+    store <3 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, ptr addrspace(3) @"sharedData", align 16 
+    store <3 x float> <float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>, ptr addrspace(3)   getelementptr inbounds (i8, ptr addrspace(3) @"sharedData", i32 16), align 16 
+    ret void
+ } 

diff  --git a/llvm/test/CodeGen/DirectX/scalarize-two-calls.ll b/llvm/test/CodeGen/DirectX/scalarize-two-calls.ll
new file mode 100644
index 00000000000000..a14c1de5cc4205
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/scalarize-two-calls.ll
@@ -0,0 +1,25 @@
+; RUN: llc %s -mtriple=dxil-pc-shadermodel6.3-library --filetype=asm -o - | FileCheck %s
+
+; CHECK: target triple = "dxilv1.3-pc-shadermodel6.3-library"
+; CHECK-LABEL: cos_sin_float_test
+define noundef <4 x float> @cos_sin_float_test(<4 x float> noundef %a) {
+    ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+    ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee0]])
+    ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+    ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee1]])
+    ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+    ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee2]])
+    ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+    ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee3]])
+    ; CHECK: [[ie4:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ie0]])
+    ; CHECK: [[ie5:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ie1]])
+    ; CHECK: [[ie6:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ie2]])
+    ; CHECK: [[ie7:%.*]] = call float @dx.op.unary.f32(i32 12, float [[ie3]])
+    ; CHECK: insertelement <4 x float> poison, float [[ie4]], i64 0
+    ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie5]], i64 1
+    ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie6]], i64 2
+    ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie7]], i64 3
+    %2 = tail call <4 x float> @llvm.sin.v4f32(<4 x float> %a) 
+    %3 = tail call <4 x float> @llvm.cos.v4f32(<4 x float> %2) 
+    ret <4 x float> %3 
+} 

diff  --git a/llvm/test/CodeGen/DirectX/sin.ll b/llvm/test/CodeGen/DirectX/sin.ll
index f309a36c6b8e6b..ac6b217be80e75 100644
--- a/llvm/test/CodeGen/DirectX/sin.ll
+++ b/llvm/test/CodeGen/DirectX/sin.ll
@@ -1,25 +1,39 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for sin are generated for float and half.
-; CHECK:call float @dx.op.unary.f32(i32 13, float %{{.*}})
-; CHECK:call half @dx.op.unary.f16(i32 13, half %{{.*}})
 
-; Function Attrs: noinline nounwind optnone
-define noundef float @sin_float(float noundef %a) #0 {
+define noundef float @sin_float(float noundef %a) {
 entry:
-  %a.addr = alloca float, align 4
-  store float %a, ptr %a.addr, align 4
-  %0 = load float, ptr %a.addr, align 4
-  %1 = call float @llvm.sin.f32(float %0)
+  ; CHECK:call float @dx.op.unary.f32(i32 13, float %{{.*}})
+  %1 = call float @llvm.sin.f32(float %a)
   ret float %1
 }
 
-; Function Attrs: noinline nounwind optnone
-define noundef half @sin_half(half noundef %a) #0 {
+define noundef half @sin_half(half noundef %a) {
 entry:
-  %a.addr = alloca half, align 2
-  store half %a, ptr %a.addr, align 2
-  %0 = load half, ptr %a.addr, align 2
-  %1 = call half @llvm.sin.f16(half %0)
+  ; CHECK:call half @dx.op.unary.f16(i32 13, half %{{.*}})
+  %1 = call half @llvm.sin.f16(half %a)
   ret half %1
 }
+
+define noundef <4 x float> @sin_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.sin.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
+declare half @llvm.sin.f16(half)
+declare float @llvm.sin.f32(float)
+declare <4 x float> @llvm.sin.v4f32(<4 x float>)

diff  --git a/llvm/test/CodeGen/DirectX/sinh.ll b/llvm/test/CodeGen/DirectX/sinh.ll
index d4d3eda9eccb6c..deba726e8d9adc 100644
--- a/llvm/test/CodeGen/DirectX/sinh.ll
+++ b/llvm/test/CodeGen/DirectX/sinh.ll
@@ -1,20 +1,39 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for sinh are generated for float and half.
 
-define noundef float @tan_float(float noundef %a) {
+define noundef float @sinh_float(float noundef %a) {
 entry:
 ; CHECK:call float @dx.op.unary.f32(i32 19, float %{{.*}})
   %elt.sinh = call float @llvm.sinh.f32(float %a)
   ret float %elt.sinh
 }
 
-define noundef half @tan_half(half noundef %a) {
+define noundef half @sinh_half(half noundef %a) {
 entry:
 ; CHECK:call half @dx.op.unary.f16(i32 19, half %{{.*}})
   %elt.sinh = call half @llvm.sinh.f16(half %a)
   ret half %elt.sinh
 }
 
+define noundef <4 x float> @sinh_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 19, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 19, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 19, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 19, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.sinh.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.sinh.f16(half)
 declare float @llvm.sinh.f32(float)
+declare <4 x float> @llvm.sinh.v4f32(<4 x float>)

diff  --git a/llvm/test/CodeGen/DirectX/sqrt.ll b/llvm/test/CodeGen/DirectX/sqrt.ll
index 792fbc8d0614d3..e2955b4efa2ec4 100644
--- a/llvm/test/CodeGen/DirectX/sqrt.ll
+++ b/llvm/test/CodeGen/DirectX/sqrt.ll
@@ -1,20 +1,39 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for sqrt are generated for float and half.
 
-define noundef float @sqrt_float(float noundef %a) #0 {
+define noundef float @sqrt_float(float noundef %a) {
 entry:
 ; CHECK:call float @dx.op.unary.f32(i32 24, float %{{.*}})
   %elt.sqrt = call float @llvm.sqrt.f32(float %a)
   ret float %elt.sqrt
 }
 
-define noundef half @sqrt_half(half noundef %a) #0 {
+define noundef half @sqrt_half(half noundef %a) {
 entry:
 ; CHECK:call half @dx.op.unary.f16(i32 24, half %{{.*}})
   %elt.sqrt = call half @llvm.sqrt.f16(half %a)
   ret half %elt.sqrt
 }
 
+define noundef <4 x float> @sqrt_float4(<4 x float> noundef %a) {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 24, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 24, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 24, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 24, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.sqrt.f16(half)
 declare float @llvm.sqrt.f32(float)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)

diff  --git a/llvm/test/CodeGen/DirectX/tan.ll b/llvm/test/CodeGen/DirectX/tan.ll
index 6f7beb592339a9..cf6965a95c04e1 100644
--- a/llvm/test/CodeGen/DirectX/tan.ll
+++ b/llvm/test/CodeGen/DirectX/tan.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for tan are generated for float and half.
 
@@ -16,5 +16,24 @@ entry:
   ret half %elt.tan
 }
 
+define noundef <4 x float> @tan_float4(<4 x float> noundef %a) #0 {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 14, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 14, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 14, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 14, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.tan.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.tan.f16(half)
 declare float @llvm.tan.f32(float)
+declare <4 x float> @llvm.tan.v4f32(<4 x float>)

diff  --git a/llvm/test/CodeGen/DirectX/tanh.ll b/llvm/test/CodeGen/DirectX/tanh.ll
index e6642d9a74c8a3..54ec6f29fa0c3c 100644
--- a/llvm/test/CodeGen/DirectX/tanh.ll
+++ b/llvm/test/CodeGen/DirectX/tanh.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for tanh are generated for float and half.
 
@@ -16,5 +16,24 @@ entry:
   ret half %elt.tanh
 }
 
+define noundef <4 x float> @tanh_float4(<4 x float> noundef %a) #0 {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 20, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 20, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 20, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 20, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.tanh.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.tanh.f16(half)
 declare float @llvm.tanh.f32(float)
+declare <4 x float> @llvm.tanh.v4f32(<4 x float>)

diff  --git a/llvm/test/CodeGen/DirectX/trunc.ll b/llvm/test/CodeGen/DirectX/trunc.ll
index f00b737da4dbb3..6d9c222595c448 100644
--- a/llvm/test/CodeGen/DirectX/trunc.ll
+++ b/llvm/test/CodeGen/DirectX/trunc.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for trunc are generated for float and half.
 
@@ -16,5 +16,24 @@ entry:
   ret half %elt.trunc
 }
 
+define noundef <4 x float> @trunc_float4(<4 x float> noundef %a) #0 {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 29, float [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call float @dx.op.unary.f32(i32 29, float [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x float> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call float @dx.op.unary.f32(i32 29, float [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x float> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call float @dx.op.unary.f32(i32 29, float [[ee3]])
+  ; CHECK: insertelement <4 x float> poison, float [[ie0]], i64 0
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie1]], i64 1
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie2]], i64 2
+  ; CHECK: insertelement <4 x float> %{{.*}}, float [[ie3]], i64 3
+  %2 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a) 
+  ret <4 x float> %2
+}
+
 declare half @llvm.trunc.f16(half)
 declare float @llvm.trunc.f32(float)
+declare <4 x float> @llvm.trunc.v4f32(<4 x float>)

diff  --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp
index 1bdfa71830ba22..c5bc7b43e03314 100644
--- a/llvm/tools/opt/optdriver.cpp
+++ b/llvm/tools/opt/optdriver.cpp
@@ -375,6 +375,7 @@ static bool shouldPinPassToLegacyPM(StringRef Pass) {
       "fix-irreducible",
       "expand-large-fp-convert",
       "callbrprepare",
+      "scalarizer",
   };
   for (const auto &P : PassNamePrefix)
     if (Pass.starts_with(P))