[llvm] [DirectX] Add support to lower LLVM intrinsics ceil, cos, fabs, floor and smax to DXIL Ops. (PR #78767)

Fri Jan 19 11:33:46 PST 2024

https://github.com/bharadwajy created https://github.com/llvm/llvm-project/pull/78767

Also add a DXILStrengthReduce pass that rewrites LLVM IR in preparation for DXIL lowering. Move the rewriting of FNeg from DXILPrepareModule to this pass and add support for rewriting the abs intrinsic in this pass.

Add tests for each of the instructions being newly lowered and rewritten.

>From d2ccc13a822a68ce4b27a828e4f1d8e70295b362 Mon Sep 17 00:00:00 2001
From: Bharadwaj Yadavalli <Bharadwaj.Yadavalli at microsoft.com>
Date: Tue, 16 Jan 2024 11:38:21 -0500
Subject: [PATCH] [DirectX] Add support to lower LLVM intrinsics ceil, cos,
 fabs, floor and smax to DXIL Ops.

Also add a DXILStrengthReduce pass that rewrites LLVM IR in preparation for
DXIL lowering. Move the rewriting of FNeg from DXILPrepareModule to this pass
and add support for rewriting the abs intrinsic in this pass.

Add tests for each of the instructions being newly lowered and rewritten.
---
 llvm/lib/Target/DirectX/CMakeLists.txt        |   1 +
 llvm/lib/Target/DirectX/DXIL.td               |  94 +++++++++---
 llvm/lib/Target/DirectX/DXILOpLowering.cpp    |   2 +
 llvm/lib/Target/DirectX/DXILPrepare.cpp       |  15 +-
 .../lib/Target/DirectX/DXILStrengthReduce.cpp | 135 ++++++++++++++++++
 llvm/lib/Target/DirectX/DirectX.h             |   6 +
 .../Target/DirectX/DirectXTargetMachine.cpp   |   2 +
 llvm/test/CodeGen/DirectX/abs.ll              |  69 +++++++++
 llvm/test/CodeGen/DirectX/ceil.ll             |  42 ++++++
 llvm/test/CodeGen/DirectX/cos.ll              |  43 ++++++
 llvm/test/CodeGen/DirectX/fabs.ll             |  73 ++++++++++
 llvm/test/CodeGen/DirectX/floor.ll            |  42 ++++++
 llvm/test/CodeGen/DirectX/smax.ll             |  30 ++++
 13 files changed, 519 insertions(+), 35 deletions(-)
 create mode 100644 llvm/lib/Target/DirectX/DXILStrengthReduce.cpp
 create mode 100644 llvm/test/CodeGen/DirectX/abs.ll
 create mode 100644 llvm/test/CodeGen/DirectX/ceil.ll
 create mode 100644 llvm/test/CodeGen/DirectX/cos.ll
 create mode 100644 llvm/test/CodeGen/DirectX/fabs.ll
 create mode 100644 llvm/test/CodeGen/DirectX/floor.ll
 create mode 100644 llvm/test/CodeGen/DirectX/smax.ll

diff --git a/llvm/lib/Target/DirectX/CMakeLists.txt b/llvm/lib/Target/DirectX/CMakeLists.txt
index bf93280779bf8b..ef49fc9fa94491 100644
--- a/llvm/lib/Target/DirectX/CMakeLists.txt
+++ b/llvm/lib/Target/DirectX/CMakeLists.txt
@@ -27,6 +27,7 @@ add_llvm_target(DirectXCodeGen
   DXILResource.cpp
   DXILResourceAnalysis.cpp
   DXILShaderFlags.cpp
+  DXILStrengthReduce.cpp
   DXILTranslateMetadata.cpp
 
   LINK_COMPONENTS
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 709279889653b8..c71464561fafe0 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -28,6 +28,7 @@ def ThreadIdClass : dxil_class<"ThreadId">;
 def GroupIdClass : dxil_class<"GroupId">;
 
 def binary_uint : dxil_category<"Binary uint">;
+def binary_int : dxil_category<"Binary int">;
 def unary_float : dxil_category<"Unary float">;
 def ComputeID : dxil_category<"Compute/Mesh/Amplification shader">;
 
@@ -86,31 +87,78 @@ class dxil_op<string name, int code_id, dxil_class code_class, dxil_category op_
   let stats_group = _stats_group;
 }
 
-// The intrinsic which map directly to this dxil op.
+// Intrinsic that maps directly to this dxil op.
 class dxil_map_intrinsic<Intrinsic llvm_intrinsic_> { Intrinsic llvm_intrinsic = llvm_intrinsic_; }
 
-def Sin : dxil_op<"Sin", 13, Unary, unary_float, "returns sine(theta) for theta in radians.",
-  "half;float;", "rn",
-  [
-    dxil_param<0, "$o", "", "operation result">,
-    dxil_param<1, "i32", "opcode", "DXIL opcode">,
-    dxil_param<2, "$o", "value", "input value">
-  ],
-  ["floats"]>,
-  dxil_map_intrinsic<int_sin>;
-
-def UMax :dxil_op< "UMax", 39,  Binary,  binary_uint, "unsigned integer maximum. UMax(a,b) = a > b ? a : b",
-    "i16;i32;i64;",  "rn",
-  [
-    dxil_param<0,  "$o",  "",  "operation result">,
-    dxil_param<1,  "i32",  "opcode",  "DXIL opcode">,
-    dxil_param<2,  "$o",  "a",  "input value">,
-    dxil_param<3,  "$o",  "b",  "input value">
-  ],
-  ["uints"]>,
-  dxil_map_intrinsic<int_umax>;
-
-def ThreadId :dxil_op< "ThreadId", 93,  ThreadIdClass, ComputeID, "reads the thread ID", "i32;",  "rn",
+def Fabs : dxil_op<"Fabs", 6, Unary, unary_float,
+                              "returns the absolute value of the input value.", 
+                              "half;float;double;", "rn",
+                              [
+                               dxil_param<0, "$o", "", "operation result">,
+                               dxil_param<1, "i32", "opcode", "DXIL opcode">,
+                               dxil_param<2, "$o", "value", "input value">
+                              ], ["floats"]>,
+                              dxil_map_intrinsic<int_fabs>;
+def Cos : dxil_op<"Cos", 12, Unary, unary_float,
+                              "returns cosine(theta) for theta in radians.", 
+                              "half;float;double;", "rn",
+                              [
+                               dxil_param<0, "$o", "", "operation result">,
+                               dxil_param<1, "i32", "opcode", "DXIL opcode">,
+                               dxil_param<2, "$o", "value", "input value">
+                              ], ["floats"]>,
+                              dxil_map_intrinsic<int_cos>;
+def Sin : dxil_op<"Sin", 13, Unary, unary_float,
+                              "returns sine(theta) for theta in radians.", 
+                              "half;float;double;", "rn",
+                              [
+                               dxil_param<0, "$o", "", "operation result">,
+                               dxil_param<1, "i32", "opcode", "DXIL opcode">,
+                               dxil_param<2, "$o", "value", "input value">
+                              ], ["floats"]>,
+                              dxil_map_intrinsic<int_sin>;
+
+def Round_ni : dxil_op<"Round_ni", 27, Unary, unary_float,
+                         "rounds towards -INF, commonly known as floor()", "float;double;",
+                         "rn",
+                         [
+                           dxil_param<0, "$o", "", "operation result">,
+                           dxil_param<1, "i32", "opcode", "DXIL opcode">,
+                           dxil_param<2, "$o", "value", "input value">
+                         ], ["floats"]>,
+                         dxil_map_intrinsic<int_floor>;
+
+def Round_pi : dxil_op<"Round_pi", 28, Unary, unary_float,
+                         "rounds towards +INF, commonly known as ceil()", "float;double;", "rn",
+                         [
+                          dxil_param<0, "$o", "", "operation result">,
+                          dxil_param<1, "i32", "opcode", "DXIL opcode">,
+                          dxil_param<2, "$o", "value", "input value">
+                         ], ["floats"]>,
+                         dxil_map_intrinsic<int_ceil>;
+
+def IMax : dxil_op<"IMax", 37, Binary, binary_int,
+                               "IMax(a,b) returns a if a > b, else b", "i32;i64;", "rn",
+                                [
+                                  dxil_param<0,  "$o",  "",  "operation result">,
+                                  dxil_param<1,  "i32",  "opcode",  "DXIL opcode">,
+                                  dxil_param<2,  "$o",  "a",  "input value">,
+                                  dxil_param<3,  "$o",  "b",  "input value">
+                                ], ["ints"]>,
+                                dxil_map_intrinsic<int_smax>;
+
+def UMax : dxil_op<"UMax", 39, Binary, binary_uint,
+                               "unsigned integer maximum. UMax(a,b) = a > b ? a : b", "i16;i32;i64;", "rn",
+                               [
+                                  dxil_param<0,  "$o",  "",  "operation result">,
+                                  dxil_param<1,  "i32",  "opcode",  "DXIL opcode">,
+                                  dxil_param<2,  "$o",  "a",  "input value">,
+                                  dxil_param<3,  "$o",  "b",  "input value">
+                                ], ["uints"]>,
+                               dxil_map_intrinsic<int_umax>;
+
+// ThreadID and GroupId
+def ThreadId : dxil_op< "ThreadId", 93,  ThreadIdClass, ComputeID, "reads the thread ID", "i32;",  "rn",
   [
     dxil_param<0,  "i32",  "",  "thread ID component">,
     dxil_param<1,  "i32",  "opcode",  "DXIL opcode">,
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index f6e2297e9af41f..cbfd65e27983f3 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -45,6 +45,8 @@ static void lowerIntrinsic(dxil::OpCode DXILOp, Function &F, Module &M) {
     Args.append(CI->arg_begin(), CI->arg_end());
     B.SetInsertPoint(CI);
     CallInst *DXILCI = DXILB.createDXILOpCall(DXILOp, OverloadTy, CI->args());
+    // Retain tail call property
+    DXILCI->setTailCall(CI->isTailCall());
 
     CI->replaceAllUsesWith(DXILCI);
     CI->eraseFromParent();
diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp
index 026911946b47f0..aa0012114548a0 100644
--- a/llvm/lib/Target/DirectX/DXILPrepare.cpp
+++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 ///
-/// \file This file contains pases and utilities to convert a modern LLVM
+/// \file This file contains passes and utilities to convert a modern LLVM
 /// module into a module compatible with the LLVM 3.7-based DirectX Intermediate
 /// Language (DXIL).
 //===----------------------------------------------------------------------===//
@@ -119,17 +119,8 @@ class DXILPrepareModule : public ModulePass {
       for (auto &BB : F) {
         IRBuilder<> Builder(&BB);
         for (auto &I : make_early_inc_range(BB)) {
-          if (I.getOpcode() == Instruction::FNeg) {
-            Builder.SetInsertPoint(&I);
-            Value *In = I.getOperand(0);
-            Value *Zero = ConstantFP::get(In->getType(), -0.0);
-            I.replaceAllUsesWith(Builder.CreateFSub(Zero, In));
-            I.eraseFromParent();
-            continue;
-          }
-
-          // Emtting NoOp bitcast instructions allows the ValueEnumerator to be
-          // unmodified as it reserves instruction IDs during contruction.
+          // Emitting NoOp bitcast instructions allows the ValueEnumerator to be
+          // unmodified as it reserves instruction IDs during construction.
           if (auto LI = dyn_cast<LoadInst>(&I)) {
             if (Value *NoOpBitcast = maybeGenerateBitcast(
                     Builder, PointerTypes, I, LI->getPointerOperand(),
diff --git a/llvm/lib/Target/DirectX/DXILStrengthReduce.cpp b/llvm/lib/Target/DirectX/DXILStrengthReduce.cpp
new file mode 100644
index 00000000000000..55ccad5f539b36
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DXILStrengthReduce.cpp
@@ -0,0 +1,135 @@
+//===- DXILStrengthReduce.cpp - Strength reduce LLVM IR for DXIL ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file This file contains a strength reduction pass that rewrites a modern
+/// LLVM module into one whose LLVM intrinsics are amenable to lowering to the
+/// LLVM 3.7-based DirectX Intermediate Language (DXIL).
+//===----------------------------------------------------------------------===//
+
+#include "DirectX.h"
+#include "DirectXIRPasses/PointerTypeAnalysis.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/AttributeMask.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+
+#define DEBUG_TYPE "dxil-strength-reduce"
+
+using namespace llvm;
+using namespace llvm::dxil;
+
+namespace {
+class DXILStrengthReduce : public ModulePass {
+
+public:
+  /// Rewrite IR in preparation for DXIL lowering: calls to llvm.abs become
+  /// smax(x, 0 - x) and fneg instructions become fsub -0.0, x. Declarations
+  /// of llvm.abs left without users by the rewrite are erased.
+  /// \returns true if any instruction in \p M was rewritten.
+  bool runOnModule(Module &M) override {
+    bool Changed = false;
+    for (auto &F : make_early_inc_range(M.functions())) {
+      IRBuilder<> IRB(M.getContext());
+      // Set only when a call to this declaration was rewritten, so that
+      // unrelated declarations with no uses are not deleted by this pass.
+      bool IntrinsicReplaced = false;
+      if (F.isDeclaration()) {
+        // Convert
+        //    %ret = call i32 @llvm.abs.i32(i32 %arg, i1 false)
+        // to
+        //    %NegArg = sub i32 0, %arg
+        //    %IMax = call i32 @llvm.smax.i32(i32 %arg, i32 %NegArg)
+        if (F.getIntrinsicID() == Intrinsic::abs) {
+          // Visit every call of the intrinsic.
+          for (User *U : make_early_inc_range(F.users())) {
+            auto *IntrinsicCall = dyn_cast<CallInst>(U);
+            if (!IntrinsicCall)
+              continue;
+            Value *Input = IntrinsicCall->getArgOperand(0);
+
+            // The second argument of llvm.abs is the immediate i1
+            // is_int_min_poison flag; cast<> asserts it is a ConstantInt.
+            bool IsIntMinPoison =
+                cast<ConstantInt>(IntrinsicCall->getArgOperand(1))->isOne();
+
+            // Construct sub(0, Input). Only nsw may be set: 0 - INT_MIN is
+            // the one signed overflow, and it is poison exactly when the
+            // flag is set. nuw must not be set because 0 - x wraps in the
+            // unsigned sense for every non-zero x, which would make the
+            // result poison for almost all inputs.
+            Value *ZeroValue = ConstantInt::get(Input->getType(), 0);
+            IRB.SetInsertPoint(IntrinsicCall);
+            auto *SubInst = IRB.CreateSub(ZeroValue, Input, "NegArg",
+                                          /*HasNUW=*/false,
+                                          /*HasNSW=*/IsIntMinPoison);
+
+            // Generate the smax intrinsic call that replaces the abs call.
+            Value *IntrinsicCallArgs[] = {Input, SubInst};
+            auto *IMaxCall = IRB.CreateIntrinsic(
+                Input->getType(), Intrinsic::smax,
+                ArrayRef<Value *>(IntrinsicCallArgs), nullptr, "IMax");
+            // Retain the tail call and attributes of the intrinsic being
+            // replaced.
+            IMaxCall->setTailCall(IntrinsicCall->isTailCall());
+            IMaxCall->setAttributes(IntrinsicCall->getAttributes());
+            IntrinsicCall->replaceAllUsesWith(IMaxCall);
+            IntrinsicCall->eraseFromParent();
+            IntrinsicReplaced = true;
+          }
+        }
+        if (IntrinsicReplaced) {
+          Changed = true;
+          if (F.user_empty())
+            F.eraseFromParent();
+        }
+      } else {
+        // Reduce strength of instructions
+        for (auto &BB : F) {
+          IRBuilder<> Builder(&BB);
+          for (auto &I : make_early_inc_range(BB)) {
+            // Rewrite
+            //    %nval = fneg double %val
+            // to
+            //    %nval = fsub double -0.000000e+00, %val
+            if (I.getOpcode() == Instruction::FNeg) {
+              Builder.SetInsertPoint(&I);
+              Value *In = I.getOperand(0);
+              Value *Zero = ConstantFP::get(In->getType(), -0.0);
+              I.replaceAllUsesWith(Builder.CreateFSub(Zero, In));
+              I.eraseFromParent();
+              Changed = true;
+            }
+          }
+        }
+      }
+    }
+    return Changed;
+  }
+
+  DXILStrengthReduce() : ModulePass(ID) {}
+
+  static char ID; // Pass identification.
+};
+char DXILStrengthReduce::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(DXILStrengthReduce, DEBUG_TYPE, "DXIL Strength Reduce",
+                      false, false)
+INITIALIZE_PASS_END(DXILStrengthReduce, DEBUG_TYPE, "DXIL Strength Reduce",
+                    false, false)
+
+ModulePass *llvm::createDXILStrengthReducePass() {
+  return new DXILStrengthReduce();
+}
diff --git a/llvm/lib/Target/DirectX/DirectX.h b/llvm/lib/Target/DirectX/DirectX.h
index eaecc3ac280c4c..51f2791ba9cc0b 100644
--- a/llvm/lib/Target/DirectX/DirectX.h
+++ b/llvm/lib/Target/DirectX/DirectX.h
@@ -28,6 +28,12 @@ void initializeDXILPrepareModulePass(PassRegistry &);
 /// Pass to convert modules into DXIL-compatable modules
 ModulePass *createDXILPrepareModulePass();
 
+/// Initializer for DXIL strength reduce
+void initializeDXILStrengthReducePass(PassRegistry &);
+
+/// Pass to reduce strength during lowering into DXIL-compatible modules
+ModulePass *createDXILStrengthReducePass();
+
 /// Initializer for DXILOpLowering
 void initializeDXILOpLoweringLegacyPass(PassRegistry &);
 
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index 06938f8c74f155..2cd8f049e011d2 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -39,6 +39,7 @@ using namespace llvm;
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() {
   RegisterTargetMachine<DirectXTargetMachine> X(getTheDirectXTarget());
   auto *PR = PassRegistry::getPassRegistry();
+  initializeDXILStrengthReducePass(*PR);
   initializeDXILPrepareModulePass(*PR);
   initializeEmbedDXILPassPass(*PR);
   initializeWriteDXILPassPass(*PR);
@@ -76,6 +77,7 @@ class DirectXPassConfig : public TargetPassConfig {
 
   FunctionPass *createTargetRegisterAllocator(bool) override { return nullptr; }
   void addCodeGenPrepare() override {
+    addPass(createDXILStrengthReducePass());
     addPass(createDXILOpLoweringLegacyPass());
     addPass(createDXILPrepareModulePass());
     addPass(createDXILTranslateMetadataPass());
diff --git a/llvm/test/CodeGen/DirectX/abs.ll b/llvm/test/CodeGen/DirectX/abs.ll
new file mode 100644
index 00000000000000..fa6b33ad749728
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/abs.ll
@@ -0,0 +1,69 @@
+; Make sure dxil operation function calls for abs are appropriately strength reduced for int and int64_t.
+; RUN: opt -S -dxil-strength-reduce < %s | FileCheck %s -check-prefix=TEST_SR
+
+; Make sure output of strength reduction pass is lowered to DXIL code as expected.
+; RUN: opt -S -dxil-strength-reduce -dxil-op-lower < %s | FileCheck %s -check-prefix=TEST_SR_OP_LOWER
+
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-unknown-shadermodel6.5-compute"
+
+@"?a@@3HA" = local_unnamed_addr global i32 0, align 4
+@"?b@@3HA" = local_unnamed_addr global i32 0, align 4
+@"?c@@3JA" = local_unnamed_addr global i64 0, align 8
+@"?d@@3JA" = local_unnamed_addr global i64 0, align 8
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none)
+define void @"?test_i32@@YAXXZ"() local_unnamed_addr #1 {
+entry:
+  %0 = load i32, ptr @"?b@@3HA", align 4, !tbaa !4
+  ; TEST_SR:%NegArg = sub i32 0, %0
+  ; TEST_SR-NEXT: %IMax = tail call i32 @llvm.smax.i32(i32 %0, i32 %NegArg)
+  ; TEST_SR_OP_LOWER: %NegArg = sub i32 0, %0
+  ; TEST_SR_OP_LOWER-NEXT:%1 = tail call i32 @dx.op.binary.i32(i32 37, i32 %0, i32 %NegArg)
+  %elt.abs = tail call i32 @llvm.abs.i32(i32 %0, i1 false)
+  ; TEST_SR: store i32 %IMax, ptr @"?a@@3HA", align 4, !tbaa !4
+  ; TEST_SR_OP_LOWER: store i32 %1, ptr @"?a@@3HA", align 4, !tbaa !4
+  store i32 %elt.abs, ptr @"?a@@3HA", align 4, !tbaa !4
+  ret void
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.abs.i32(i32, i1 immarg) #2
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none)
+define void @"?test_i64@@YAXI@Z"(i32 noundef %GI) local_unnamed_addr #1 {
+entry:
+  %0 = load i64, ptr @"?d@@3JA", align 8, !tbaa !8
+  ; TEST_SR: %NegArg = sub i64 0, %0
+  ; TEST_SR-NEXT: %IMax = tail call i64 @llvm.smax.i64(i64 %0, i64 %NegArg)
+  ; TEST_SR_OP_LOWER: %NegArg = sub i64 0, %0
+  ; TEST_SR_OP_LOWER-NEXT: %1 = tail call i64 @dx.op.binary.i64(i32 37, i64 %0, i64 %NegArg)
+  %elt.abs = tail call i64 @llvm.abs.i64(i64 %0, i1 false)
+  ; TEST_SR: store i64 %IMax, ptr @"?c@@3JA", align 8, !tbaa !8
+  ; TEST_SR_OP_LOWER: store i64 %1, ptr @"?c@@3JA", align 8, !tbaa !8
+  store i64 %elt.abs, ptr @"?c@@3JA", align 8, !tbaa !8
+  ret void
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i64 @llvm.abs.i64(i64, i1 immarg) #2
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+!llvm.module.flags = !{!0, !1}
+!dx.valver = !{!2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{i32 1, i32 7}
+!3 = !{!"clang version 18.0.0git (git at github.com:somefork/llvm-project.git someSHA)"}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"int", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C++ TBAA"}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"long", !6, i64 0}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/DirectX/ceil.ll b/llvm/test/CodeGen/DirectX/ceil.ll
new file mode 100644
index 00000000000000..3b1835386f28db
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ceil.ll
@@ -0,0 +1,42 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for ceil are generated for float and double.
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-pc-shadermodel6.7-library"
+
+; Function Attrs: noinline nounwind optnone
+define noundef float @_Z3foof(float noundef %a) #0 {
+entry:
+  %a.addr = alloca float, align 4
+  store float %a, ptr %a.addr, align 4
+  %0 = load float, ptr %a.addr, align 4
+  ; CHECK:call float @dx.op.unary.f32(i32 28, float %{{.*}})
+  %1 = call float @llvm.ceil.f32(float %0)
+  ret float %1
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.ceil.f32(float) #1
+
+; Function Attrs: noinline nounwind optnone
+define noundef double @_Z3barDh(double noundef %a) #0 {
+entry:
+  %a.addr = alloca double, align 8
+  store double %a, ptr %a.addr, align 8
+  %0 = load double, ptr %a.addr, align 8
+  ; CHECK:call double @dx.op.unary.f64(i32 28, double %{{.*}})
+  %1 = call double @llvm.ceil.f64(double %0)
+  ret double %1
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare double @llvm.ceil.f64(double) #1
+
+attributes #0 = { noinline nounwind optnone "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 15.0.0 (https://github.com/llvm/llvm-project.git 73417c517644db5c419c85c0b3cb6750172fcab5)"}
diff --git a/llvm/test/CodeGen/DirectX/cos.ll b/llvm/test/CodeGen/DirectX/cos.ll
new file mode 100644
index 00000000000000..b1813435ec8b7a
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/cos.ll
@@ -0,0 +1,43 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for cos are generated for float and half.
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-pc-shadermodel6.7-library"
+
+; Function Attrs: noinline nounwind optnone
+define noundef float @_Z3foof(float noundef %a) #0 {
+entry:
+  %a.addr = alloca float, align 4
+  store float %a, ptr %a.addr, align 4
+  %0 = load float, ptr %a.addr, align 4
+  ; CHECK:call float @dx.op.unary.f32(i32 12, float %{{.*}})
+  %1 = call float @llvm.cos.f32(float %0)
+  ret float %1
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.cos.f32(float) #1
+
+; Function Attrs: noinline nounwind optnone
+define noundef half @_Z3barDh(half noundef %a) #0 {
+entry:
+  %a.addr = alloca half, align 2
+  store half %a, ptr %a.addr, align 2
+  %0 = load half, ptr %a.addr, align 2
+  ; CHECK:call half @dx.op.unary.f16(i32 12, half %{{.*}})
+  %1 = call half @llvm.cos.f16(half %0)
+  ret half %1
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare half @llvm.cos.f16(half) #1
+
+attributes #0 = { noinline nounwind optnone "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 15.0.0 (https://github.com/llvm/llvm-project.git 73417c517644db5c419c85c0b3cb6750172fcab5)"}
diff --git a/llvm/test/CodeGen/DirectX/fabs.ll b/llvm/test/CodeGen/DirectX/fabs.ll
new file mode 100644
index 00000000000000..0118b0db2418f8
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/fabs.ll
@@ -0,0 +1,73 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for fabs are generated for half, float and double.
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-unknown-shadermodel6.5-compute"
+
+@"?h1@@3$halff@A" = local_unnamed_addr global float 0.000000e+00, align 4
+@"?h2@@3$halff@A" = local_unnamed_addr global float 0.000000e+00, align 4
+@"?f1@@3MA" = local_unnamed_addr global float 0.000000e+00, align 4
+@"?f2@@3MA" = local_unnamed_addr global float 0.000000e+00, align 4
+@"?d1@@3NA" = local_unnamed_addr global double 0.000000e+00, align 8
+@"?d2@@3NA" = local_unnamed_addr global double 0.000000e+00, align 8
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none)
+define void @"?test_half@@YAXXZ"() local_unnamed_addr #1 {
+entry:
+  %0 = load float, ptr @"?h2@@3$halff@A", align 4, !tbaa !4
+  ; CHECK: %1 = tail call float @dx.op.unary.f32(i32 6, float %0)
+  %elt.abs = tail call float @llvm.fabs.f32(float %0)
+  ; CHECK: store float %1, ptr @"?h1@@3$halff@A", align 4, !tbaa !4
+  store float %elt.abs, ptr @"?h1@@3$halff@A", align 4, !tbaa !4
+  ret void
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.fabs.f32(float) #2
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none)
+define void @"?test_float@@YAXXZ"() local_unnamed_addr #1 {
+entry:
+  %0 = load float, ptr @"?f2@@3MA", align 4, !tbaa !8
+  ; CHECK: %1 = tail call float @dx.op.unary.f32(i32 6, float %0)
+  %elt.abs = tail call float @llvm.fabs.f32(float %0)
+  ; CHECK: store float %1, ptr @"?f1@@3MA", align 4, !tbaa !8
+  store float %elt.abs, ptr @"?f1@@3MA", align 4, !tbaa !8
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none)
+define void @"?test_double@@YAXXZ"() local_unnamed_addr #1 {
+entry:
+  %0 = load double, ptr @"?d2@@3NA", align 8, !tbaa !10
+  ; CHECK: %1 = tail call double @dx.op.unary.f64(i32 6, double %0)
+  %elt.abs = tail call double @llvm.fabs.f64(double %0)
+  ; CHECK: store double %1, ptr @"?d1@@3NA", align 8, !tbaa !10
+  store double %elt.abs, ptr @"?d1@@3NA", align 8, !tbaa !10
+  ret void
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare double @llvm.fabs.f64(double) #2
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+!llvm.module.flags = !{!0, !1}
+!dx.valver = !{!2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{i32 1, i32 7}
+!3 = !{!"clang version 18.0.0git (git at github.com:somefork/llvm-project.git someSHA)"}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"half", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C++ TBAA"}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"float", !6, i64 0}
+!10 = !{!11, !11, i64 0}
+!11 = !{!"double", !6, i64 0}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/DirectX/floor.ll b/llvm/test/CodeGen/DirectX/floor.ll
new file mode 100644
index 00000000000000..aba2117b5e7a33
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/floor.ll
@@ -0,0 +1,42 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for floor are generated for float and double.
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-pc-shadermodel6.7-library"
+
+; Function Attrs: noinline nounwind optnone
+define noundef float @_Z3foof(float noundef %a) #0 {
+entry:
+  %a.addr = alloca float, align 4
+  store float %a, ptr %a.addr, align 4
+  %0 = load float, ptr %a.addr, align 4
+  ; CHECK:call float @dx.op.unary.f32(i32 27, float %{{.*}})
+  %1 = call float @llvm.floor.f32(float %0)
+  ret float %1
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare float @llvm.floor.f32(float) #1
+
+; Function Attrs: noinline nounwind optnone
+define noundef double @_Z3barDh(double noundef %a) #0 {
+entry:
+  %a.addr = alloca double, align 8
+  store double %a, ptr %a.addr, align 8
+  %0 = load double, ptr %a.addr, align 8
+  ; CHECK:call double @dx.op.unary.f64(i32 27, double %{{.*}})
+  %1 = call double @llvm.floor.f64(double %0)
+  ret double %1
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare double @llvm.floor.f64(double) #1
+
+attributes #0 = { noinline nounwind optnone "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 15.0.0 (https://github.com/llvm/llvm-project.git 73417c517644db5c419c85c0b3cb6750172fcab5)"}
diff --git a/llvm/test/CodeGen/DirectX/smax.ll b/llvm/test/CodeGen/DirectX/smax.ll
new file mode 100644
index 00000000000000..3fefb4877205c1
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/smax.ll
@@ -0,0 +1,30 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for smax are generated for i32/i64.
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-pc-shadermodel6.7-library"
+
+; CHECK-LABEL:test_smax_i32
+; Function Attrs: noinline nounwind optnone
+define noundef i32 @test_smax_i32(i32 noundef %a, i32 noundef %b) #0 {
+entry:
+; CHECK:call i32 @dx.op.binary.i32(i32 37, i32 %{{.*}}, i32 %{{.*}})
+  %0 = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+  ret i32 %0
+}
+
+; CHECK-LABEL:test_smax_i64
+define noundef i64 @test_smax_i64(i64 noundef %a, i64 noundef %b) #0 {
+entry:
+; CHECK:call i64 @dx.op.binary.i64(i32 37, i64 %{{.*}}, i64 %{{.*}})
+  %0 = call i64 @llvm.smax.i64(i64 %a, i64 %b)
+  ret i64 %0
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.smax.i32(i32, i32) #1
+declare i64 @llvm.smax.i64(i64, i64) #1
+
+attributes #0 = { noinline nounwind }
+attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }