[clang] [CIR]Upstream generic intrinsic emission path (PR #179098)
Priyanshu Kumar via cfe-commits
cfe-commits at lists.llvm.org
Wed Feb 11 09:57:45 PST 2026
https://github.com/Priyanshu3820 updated https://github.com/llvm/llvm-project/pull/179098
>From 5f1482a60cb7a179ca6a119505bb37b43d15f454 Mon Sep 17 00:00:00 2001
From: Priyanshu <10b.priyanshu at gmail.com>
Date: Sun, 1 Feb 2026 15:38:02 +0530
Subject: [PATCH 01/13] Upstream generic intrinsic emission path
---
clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 200 ++++++++++++++++++
.../CIR/CodeGenBuiltins/X86/rd-builtins.c | 25 +++
2 files changed, 225 insertions(+)
create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/rd-builtins.c
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 5e6c9e8e2490e..5fd70df43a5f6 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -26,6 +26,7 @@
#include "clang/Basic/OperatorKinds.h"
#include "clang/CIR/Dialect/IR/CIRTypes.h"
#include "clang/CIR/MissingFeatures.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/ErrorHandling.h"
using namespace clang;
@@ -726,6 +727,108 @@ static RValue tryEmitFPMathIntrinsic(CIRGenFunction &cgf, const CallExpr *e,
return RValue::getIgnored();
}
+static mlir::Type
+decodeFixedType(ArrayRef<llvm::Intrinsic::IITDescriptor> &infos,
+ mlir::MLIRContext *context) {
+ using namespace llvm::Intrinsic;
+
+ IITDescriptor descriptor = infos.front();
+ infos = infos.slice(1);
+
+ switch (descriptor.Kind) {
+ case IITDescriptor::Void:
+ return cir::VoidType::get(context);
+ case IITDescriptor::Integer:
+ return cir::IntType::get(context, descriptor.Integer_Width,
+ /*isSigned=*/true);
+ case IITDescriptor::Float:
+ return cir::SingleType::get(context);
+ case IITDescriptor::Double:
+ return cir::DoubleType::get(context);
+ default:
+ llvm_unreachable("NYI");
+ }
+}
+
+/// Helper function to correct integer signedness for intrinsic arguments.
+/// IIT always returns signed integers, but the actual intrinsic may expect
+/// unsigned integers based on the AST FunctionDecl parameter types.
+static mlir::Type getIntrinsicArgumentTypeFromAST(mlir::Type iitType,
+ const CallExpr *E,
+ unsigned argIndex,
+ mlir::MLIRContext *context) {
+ // If it's not an integer type, return as-is
+ auto intTy = dyn_cast<cir::IntType>(iitType);
+ if (!intTy)
+ return iitType;
+
+ // Get the FunctionDecl from the CallExpr
+ const FunctionDecl *FD = nullptr;
+ if (const auto *DRE =
+ dyn_cast<DeclRefExpr>(E->getCallee()->IgnoreImpCasts())) {
+ FD = dyn_cast<FunctionDecl>(DRE->getDecl());
+ }
+
+ // If we have FunctionDecl and this argument exists, check its signedness
+ if (FD && argIndex < FD->getNumParams()) {
+ QualType paramType = FD->getParamDecl(argIndex)->getType();
+ if (paramType->isUnsignedIntegerType()) {
+ // Create unsigned version of the type
+ return cir::IntType::get(context, intTy.getWidth(), /*isSigned=*/false);
+ }
+ }
+
+ // Default: keep IIT type (signed)
+ return iitType;
+}
+
+static mlir::Value getCorrectedPtr(mlir::Value argValue, mlir::Type expectedTy,
+ CIRGenBuilderTy &builder) {
+ auto ptrType = mlir::dyn_cast<cir::PointerType>(argValue.getType());
+ assert(ptrType && "expected pointer type");
+
+ auto expectedPtrType = mlir::cast<cir::PointerType>(expectedTy);
+ assert(ptrType.getPointee() != expectedPtrType.getPointee() &&
+ "types should not match");
+
+ if (ptrType.getAddrSpace() != expectedPtrType.getAddrSpace()) {
+ auto newPtrType = cir::PointerType::get(ptrType.getPointee(),
+ expectedPtrType.getAddrSpace());
+ return builder.createAddrSpaceCast(argValue, newPtrType);
+ }
+
+ return argValue;
+}
+
+static cir::FuncType getIntrinsicType(mlir::MLIRContext *context,
+ llvm::Intrinsic::ID id) {
+ using namespace llvm::Intrinsic;
+
+ SmallVector<IITDescriptor, 8> table;
+ getIntrinsicInfoTableEntries(id, table);
+
+ ArrayRef<IITDescriptor> tableRef = table;
+ mlir::Type resultTy = decodeFixedType(tableRef, context);
+
+ SmallVector<mlir::Type, 8> argTypes;
+ bool isVarArg = false;
+ while (!tableRef.empty()) {
+ auto kind = tableRef.front().Kind;
+ if (kind == IITDescriptor::VarArg) {
+ isVarArg = true;
+ break; // VarArg is last
+ }
+ argTypes.push_back(decodeFixedType(tableRef, context));
+ }
+
+ // CIR convention: no explicit void return type
+ if (isa<cir::VoidType>(resultTy))
+ return cir::FuncType::get(context, argTypes, /*optionalReturnType=*/nullptr,
+ isVarArg);
+
+ return cir::FuncType::get(context, argTypes, resultTy, isVarArg);
+}
+
RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
const CallExpr *e,
ReturnValueSlot returnValue) {
@@ -1816,6 +1919,103 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
return emitLibraryCall(*this, fd, e,
cgm.getBuiltinLibFunction(fd, builtinID));
+ // See if we have a target specific intrinsic.
+ std::string name = getContext().BuiltinInfo.getName(builtinID);
+ Intrinsic::ID intrinsicID = Intrinsic::not_intrinsic;
+ StringRef prefix =
+ llvm::Triple::getArchTypePrefix(getTarget().getTriple().getArch());
+ if (!prefix.empty()) {
+ intrinsicID = Intrinsic::getIntrinsicForClangBuiltin(prefix.data(), name);
+ // NOTE we don't need to perform a compatibility flag check here since the
+ // intrinsics are declared in Builtins*.def via LANGBUILTIN which filter the
+ // MS builtins via ALL_MS_LANGUAGES and are filtered earlier.
+ if (intrinsicID == Intrinsic::not_intrinsic)
+ intrinsicID = Intrinsic::getIntrinsicForMSBuiltin(prefix.data(), name);
+ }
+
+ if (intrinsicID != Intrinsic::not_intrinsic) {
+ unsigned iceArguments = 0;
+ ASTContext::GetBuiltinTypeError error;
+ getContext().GetBuiltinType(builtinID, error, &iceArguments);
+ assert(error == ASTContext::GE_None && "Should not codegen an error");
+
+ llvm::StringRef name = llvm::Intrinsic::getName(intrinsicID);
+ // cir::LLVMIntrinsicCallOp expects intrinsic name to not have prefix
+ // "llvm." For example, `llvm.nvvm.barrier0` should be passed as
+ // `nvvm.barrier0`.
+ if (!name.consume_front("llvm."))
+ assert(false && "bad intrinsic name!");
+
+ cir::FuncType intrinsicType =
+ getIntrinsicType(&getMLIRContext(), intrinsicID);
+
+ SmallVector<mlir::Value> args;
+ for (unsigned i = 0; i < e->getNumArgs(); i++) {
+ mlir::Value argValue =
+ emitScalarOrConstFoldImmArg(iceArguments, i, e->getArg(i));
+ // If the intrinsic arg type is different from the builtin arg type
+ // we need to do a bit cast.
+ mlir::Type argType = argValue.getType();
+ mlir::Type expectedTy = intrinsicType.getInput(i);
+
+ // Use helper to get the correct integer type based on AST signedness
+ mlir::Type correctedExpectedTy =
+ getIntrinsicArgumentTypeFromAST(expectedTy, e, i, &getMLIRContext());
+
+ if (argType != correctedExpectedTy)
+ argValue = getCorrectedPtr(argValue, expectedTy, builder);
+
+ args.push_back(argValue);
+ }
+
+ cir::LLVMIntrinsicCallOp intrinsicCall = cir::LLVMIntrinsicCallOp::create(
+ builder, getLoc(e->getExprLoc()), builder.getStringAttr(name),
+ intrinsicType.getReturnType(), args);
+
+ // Convert the intrinsic result to the CallExpr/AST expected return type if
+ // they differ. This can happen when an intrinsic's IIT uses a signed
+ // integer type while the AST declares an unsigned type, or when an
+ // intrinsic returns an integer but the AST expects a pointer (or vice
+ // versa). Coerce conservatively so subsequent stores/verifications succeed.
+ mlir::Value intrinsicRes = intrinsicCall.getResult();
+ mlir::Type builtinReturnType = intrinsicRes.getType();
+ mlir::Type expectedRetTy = convertType(e->getType());
+
+ if (builtinReturnType != expectedRetTy) {
+ // Integer -> Integer or width/signage differences.
+ if (cir::IntType fromInt =
+ mlir::dyn_cast<cir::IntType>(builtinReturnType)) {
+ if (cir::IntType toInt = mlir::dyn_cast<cir::IntType>(expectedRetTy))
+ intrinsicRes = builder.createIntCast(intrinsicRes, expectedRetTy);
+ else if (mlir::dyn_cast<cir::PointerType>(expectedRetTy))
+ intrinsicRes = builder.createIntToPtr(intrinsicRes, expectedRetTy);
+ else
+ intrinsicRes = builder.createBitcast(intrinsicRes, expectedRetTy);
+
+ } else if (cir::PointerType fromPtr =
+ mlir::dyn_cast<cir::PointerType>(builtinReturnType)) {
+ if (mlir::dyn_cast<cir::IntType>(expectedRetTy))
+ intrinsicRes = builder.createPtrToInt(intrinsicRes, expectedRetTy);
+ else if (cir::PointerType toPtr =
+ mlir::dyn_cast<cir::PointerType>(expectedRetTy)) {
+ if (fromPtr.getAddrSpace() != toPtr.getAddrSpace())
+ intrinsicRes =
+ builder.createAddrSpaceCast(intrinsicRes, expectedRetTy);
+ else if (fromPtr.getPointee() != toPtr.getPointee())
+ intrinsicRes = builder.createBitcast(intrinsicRes, expectedRetTy);
+ } else
+ intrinsicRes = builder.createBitcast(intrinsicRes, expectedRetTy);
+
+ } else
+ intrinsicRes = builder.createBitcast(intrinsicRes, expectedRetTy);
+ }
+
+ if (isa<cir::VoidType>(expectedRetTy))
+ return RValue::get(nullptr);
+
+ return RValue::get(intrinsicRes);
+ }
+
// Some target-specific builtins can have aggregate return values, e.g.
// __builtin_arm_mve_vld2q_u32. So if the result is an aggregate, force
// returnValue to be non-null, so that the target-specific emission code can
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/rd-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/rd-builtins.c
new file mode 100644
index 0000000000000..a1ac394110e39
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/rd-builtins.c
@@ -0,0 +1,25 @@
+// RUN: %clang -target x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -fclangir -S -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -S -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
+
+#include <x86intrin.h>
+
+unsigned long long test_rdpmc(int a) {
+ // CIR-LABEL: test_rdpmc
+ // CIR: cir.call @__rdpmc
+ // CIR: cir.store %{{.*}}, %{{.*}} : !u64i, !cir.ptr<!u64i>
+ // CIR: cir.return %{{.*}} : !u64i
+
+ // LLVM-LABEL: @test_rdpmc
+ // LLVM: call i64 @llvm.x86.rdpmc
+ // LLVM: store i64 %{{.*}}, ptr %{{.*}}, align 8
+ // LLVM: ret i64 %{{.*}}
+
+ // OGCG-LABEL: @test_rdpmc
+ // OGCG: call i64 @llvm.x86.rdpmc
+ // OGCG: ret i64 %{{.*}}
+ return _rdpmc(a);
+}
>From 440ac55b0d95f24c7fbaa0df3085ca43c225876f Mon Sep 17 00:00:00 2001
From: Priyanshu <10b.priyanshu at gmail.com>
Date: Sun, 1 Feb 2026 22:33:03 +0530
Subject: [PATCH 02/13] Update test
---
clang/test/CIR/CodeGenBuiltins/X86/rd-builtins.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/rd-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/rd-builtins.c
index a1ac394110e39..a66302c50cec5 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/rd-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/rd-builtins.c
@@ -1,12 +1,16 @@
-// RUN: %clang -target x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
-// RUN: %clang -target x86_64-unknown-linux-gnu -fclangir -S -emit-llvm %s -o %t-cir.ll
+// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
-// RUN: %clang -target x86_64-unknown-linux-gnu -S -emit-llvm %s -o %t.ll
+// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
#include <x86intrin.h>
+// CIR-LABEL: @__rdpmc
+// CIR: cir.call_llvm_intrinsic "x86.rdpmc"
+// CIR: cir.cast integral %{{.*}} : !s64i -> !u64i
+
unsigned long long test_rdpmc(int a) {
// CIR-LABEL: test_rdpmc
// CIR: cir.call @__rdpmc
>From e8cd250d9f5cf4545a4324b498e253137a67fc82 Mon Sep 17 00:00:00 2001
From: Priyanshu <10b.priyanshu at gmail.com>
Date: Tue, 3 Feb 2026 19:26:22 +0530
Subject: [PATCH 03/13] Address reviews and update test
---
clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 122 +++++++-----------
.../CIR/CodeGenBuiltins/X86/rd-builtins.c | 3 +-
2 files changed, 47 insertions(+), 78 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 5fd70df43a5f6..2453489b67668 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -727,8 +727,10 @@ static RValue tryEmitFPMathIntrinsic(CIRGenFunction &cgf, const CallExpr *e,
return RValue::getIgnored();
}
+// FIXME: Remove cgf parameter when all descriptor kinds are implemented
static mlir::Type
-decodeFixedType(ArrayRef<llvm::Intrinsic::IITDescriptor> &infos,
+decodeFixedType(CIRGenFunction &cgf,
+ ArrayRef<llvm::Intrinsic::IITDescriptor> &infos,
mlir::MLIRContext *context) {
using namespace llvm::Intrinsic;
@@ -738,6 +740,8 @@ decodeFixedType(ArrayRef<llvm::Intrinsic::IITDescriptor> &infos,
switch (descriptor.Kind) {
case IITDescriptor::Void:
return cir::VoidType::get(context);
+ // If the intrinsic expects unsigned integers, the signedness is corrected in
+ // correctIntegerSignedness()
case IITDescriptor::Integer:
return cir::IntType::get(context, descriptor.Integer_Width,
/*isSigned=*/true);
@@ -746,39 +750,23 @@ decodeFixedType(ArrayRef<llvm::Intrinsic::IITDescriptor> &infos,
case IITDescriptor::Double:
return cir::DoubleType::get(context);
default:
- llvm_unreachable("NYI");
+ cgf.cgm.errorNYI("Unimplemented intrinsic type descriptor");
+ return cir::VoidType::get(context);
}
}
-/// Helper function to correct integer signedness for intrinsic arguments.
-/// IIT always returns signed integers, but the actual intrinsic may expect
-/// unsigned integers based on the AST FunctionDecl parameter types.
-static mlir::Type getIntrinsicArgumentTypeFromAST(mlir::Type iitType,
- const CallExpr *E,
- unsigned argIndex,
- mlir::MLIRContext *context) {
- // If it's not an integer type, return as-is
+/// Helper function to correct integer signedness for intrinsic arguments and
+/// return type. IIT always returns signed integers, but the actual intrinsic
+/// may expect unsigned integers based on the AST FunctionDecl parameter types.
+static mlir::Type correctIntegerSignedness(mlir::Type iitType, QualType astType,
+ mlir::MLIRContext *context) {
auto intTy = dyn_cast<cir::IntType>(iitType);
if (!intTy)
return iitType;
- // Get the FunctionDecl from the CallExpr
- const FunctionDecl *FD = nullptr;
- if (const auto *DRE =
- dyn_cast<DeclRefExpr>(E->getCallee()->IgnoreImpCasts())) {
- FD = dyn_cast<FunctionDecl>(DRE->getDecl());
- }
-
- // If we have FunctionDecl and this argument exists, check its signedness
- if (FD && argIndex < FD->getNumParams()) {
- QualType paramType = FD->getParamDecl(argIndex)->getType();
- if (paramType->isUnsignedIntegerType()) {
- // Create unsigned version of the type
- return cir::IntType::get(context, intTy.getWidth(), /*isSigned=*/false);
- }
+ if (astType->isUnsignedIntegerType()) {
+ return cir::IntType::get(context, intTy.getWidth(), /*isSigned=*/false);
}
-
- // Default: keep IIT type (signed)
return iitType;
}
@@ -788,8 +776,7 @@ static mlir::Value getCorrectedPtr(mlir::Value argValue, mlir::Type expectedTy,
assert(ptrType && "expected pointer type");
auto expectedPtrType = mlir::cast<cir::PointerType>(expectedTy);
- assert(ptrType.getPointee() != expectedPtrType.getPointee() &&
- "types should not match");
+ assert(ptrType != expectedPtrType && "types should not match");
if (ptrType.getAddrSpace() != expectedPtrType.getAddrSpace()) {
auto newPtrType = cir::PointerType::get(ptrType.getPointee(),
@@ -797,10 +784,11 @@ static mlir::Value getCorrectedPtr(mlir::Value argValue, mlir::Type expectedTy,
return builder.createAddrSpaceCast(argValue, newPtrType);
}
- return argValue;
+ return builder.createBitcast(argValue, expectedTy);
}
-static cir::FuncType getIntrinsicType(mlir::MLIRContext *context,
+static cir::FuncType getIntrinsicType(CIRGenFunction &cgf,
+ mlir::MLIRContext *context,
llvm::Intrinsic::ID id) {
using namespace llvm::Intrinsic;
@@ -808,17 +796,18 @@ static cir::FuncType getIntrinsicType(mlir::MLIRContext *context,
getIntrinsicInfoTableEntries(id, table);
ArrayRef<IITDescriptor> tableRef = table;
- mlir::Type resultTy = decodeFixedType(tableRef, context);
+ mlir::Type resultTy = decodeFixedType(cgf, tableRef, context);
SmallVector<mlir::Type, 8> argTypes;
bool isVarArg = false;
while (!tableRef.empty()) {
- auto kind = tableRef.front().Kind;
+ llvm::Intrinsic::IITDescriptor::IITDescriptorKind kind =
+ tableRef.front().Kind;
if (kind == IITDescriptor::VarArg) {
isVarArg = true;
break; // VarArg is last
}
- argTypes.push_back(decodeFixedType(tableRef, context));
+ argTypes.push_back(decodeFixedType(cgf, tableRef, context));
}
// CIR convention: no explicit void return type
@@ -836,8 +825,12 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
// See if we can constant fold this builtin. If so, don't emit it at all.
// TODO: Extend this handling to all builtin calls that we can constant-fold.
+ // Do not constant-fold immediate (target-specific) builtins; their ASTs can
+ // trigger the constant evaluator in cases it cannot safely handle.
+ // Skip EvaluateAsRValue for those.
Expr::EvalResult result;
- if (e->isPRValue() && e->EvaluateAsRValue(result, cgm.getASTContext()) &&
+ if (e->isPRValue() && !getContext().BuiltinInfo.isImmediate(builtinID) &&
+ e->EvaluateAsRValue(result, cgm.getASTContext()) &&
!result.hasSideEffects()) {
if (result.Val.isInt())
return RValue::get(builder.getConstInt(loc, result.Val.getInt()));
@@ -1947,9 +1940,10 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
assert(false && "bad intrinsic name!");
cir::FuncType intrinsicType =
- getIntrinsicType(&getMLIRContext(), intrinsicID);
+ getIntrinsicType(*this, &getMLIRContext(), intrinsicID);
SmallVector<mlir::Value> args;
+ const FunctionDecl *fd = e->getDirectCallee();
for (unsigned i = 0; i < e->getNumArgs(); i++) {
mlir::Value argValue =
emitScalarOrConstFoldImmArg(iceArguments, i, e->getArg(i));
@@ -1958,9 +1952,12 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
mlir::Type argType = argValue.getType();
mlir::Type expectedTy = intrinsicType.getInput(i);
- // Use helper to get the correct integer type based on AST signedness
- mlir::Type correctedExpectedTy =
- getIntrinsicArgumentTypeFromAST(expectedTy, e, i, &getMLIRContext());
+ // Correct integer signedness based on AST parameter type
+ mlir::Type correctedExpectedTy = expectedTy;
+ if (fd && i < fd->getNumParams()) {
+ correctedExpectedTy = correctIntegerSignedness(
+ expectedTy, fd->getParamDecl(i)->getType(), &getMLIRContext());
+ }
if (argType != correctedExpectedTy)
argValue = getCorrectedPtr(argValue, expectedTy, builder);
@@ -1968,49 +1965,22 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
args.push_back(argValue);
}
+ // Correct return type signedness based on AST return type before creating
+ // the call, avoiding unnecessary casts in the IR.
+ mlir::Type correctedReturnType = intrinsicType.getReturnType();
+ if (fd) {
+ correctedReturnType =
+ correctIntegerSignedness(intrinsicType.getReturnType(),
+ fd->getReturnType(), &getMLIRContext());
+ }
+
cir::LLVMIntrinsicCallOp intrinsicCall = cir::LLVMIntrinsicCallOp::create(
builder, getLoc(e->getExprLoc()), builder.getStringAttr(name),
- intrinsicType.getReturnType(), args);
+ correctedReturnType, args);
- // Convert the intrinsic result to the CallExpr/AST expected return type if
- // they differ. This can happen when an intrinsic's IIT uses a signed
- // integer type while the AST declares an unsigned type, or when an
- // intrinsic returns an integer but the AST expects a pointer (or vice
- // versa). Coerce conservatively so subsequent stores/verifications succeed.
mlir::Value intrinsicRes = intrinsicCall.getResult();
- mlir::Type builtinReturnType = intrinsicRes.getType();
- mlir::Type expectedRetTy = convertType(e->getType());
-
- if (builtinReturnType != expectedRetTy) {
- // Integer -> Integer or width/signage differences.
- if (cir::IntType fromInt =
- mlir::dyn_cast<cir::IntType>(builtinReturnType)) {
- if (cir::IntType toInt = mlir::dyn_cast<cir::IntType>(expectedRetTy))
- intrinsicRes = builder.createIntCast(intrinsicRes, expectedRetTy);
- else if (mlir::dyn_cast<cir::PointerType>(expectedRetTy))
- intrinsicRes = builder.createIntToPtr(intrinsicRes, expectedRetTy);
- else
- intrinsicRes = builder.createBitcast(intrinsicRes, expectedRetTy);
-
- } else if (cir::PointerType fromPtr =
- mlir::dyn_cast<cir::PointerType>(builtinReturnType)) {
- if (mlir::dyn_cast<cir::IntType>(expectedRetTy))
- intrinsicRes = builder.createPtrToInt(intrinsicRes, expectedRetTy);
- else if (cir::PointerType toPtr =
- mlir::dyn_cast<cir::PointerType>(expectedRetTy)) {
- if (fromPtr.getAddrSpace() != toPtr.getAddrSpace())
- intrinsicRes =
- builder.createAddrSpaceCast(intrinsicRes, expectedRetTy);
- else if (fromPtr.getPointee() != toPtr.getPointee())
- intrinsicRes = builder.createBitcast(intrinsicRes, expectedRetTy);
- } else
- intrinsicRes = builder.createBitcast(intrinsicRes, expectedRetTy);
-
- } else
- intrinsicRes = builder.createBitcast(intrinsicRes, expectedRetTy);
- }
- if (isa<cir::VoidType>(expectedRetTy))
+ if (isa<cir::VoidType>(correctedReturnType))
return RValue::get(nullptr);
return RValue::get(intrinsicRes);
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/rd-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/rd-builtins.c
index a66302c50cec5..28d4d6f06ddd1 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/rd-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/rd-builtins.c
@@ -8,8 +8,7 @@
#include <x86intrin.h>
// CIR-LABEL: @__rdpmc
-// CIR: cir.call_llvm_intrinsic "x86.rdpmc"
-// CIR: cir.cast integral %{{.*}} : !s64i -> !u64i
+// CIR: cir.call_llvm_intrinsic "x86.rdpmc" %{{.*}} : (!s32i) -> !u64i
unsigned long long test_rdpmc(int a) {
// CIR-LABEL: test_rdpmc
>From 016ea30078db8bdcb3818d4abde0fc9f9ea2d6fe Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <10b.priyanshu at gmail.com>
Date: Thu, 5 Feb 2026 00:19:57 +0530
Subject: [PATCH 04/13] Apply suggestion from @andykaylor
Co-authored-by: Andy Kaylor <akaylor at nvidia.com>
---
clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 2453489b67668..a32bbeb749caf 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -1936,8 +1936,8 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
// cir::LLVMIntrinsicCallOp expects intrinsic name to not have prefix
// "llvm." For example, `llvm.nvvm.barrier0` should be passed as
// `nvvm.barrier0`.
- if (!name.consume_front("llvm."))
- assert(false && "bad intrinsic name!");
+ assert(name.starts_with("llvm.");
+ name = name.drop_front(/*strlen("llvm.")=*/5);
cir::FuncType intrinsicType =
getIntrinsicType(*this, &getMLIRContext(), intrinsicID);
>From 50afc771fd753edc07a13fa9d21216664b412953 Mon Sep 17 00:00:00 2001
From: Priyanshu <10b.priyanshu at gmail.com>
Date: Thu, 5 Feb 2026 01:07:49 +0530
Subject: [PATCH 05/13] Fix syntax error
---
clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index a32bbeb749caf..4c0bb82848d4d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -1936,7 +1936,7 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
// cir::LLVMIntrinsicCallOp expects intrinsic name to not have prefix
// "llvm." For example, `llvm.nvvm.barrier0` should be passed as
// `nvvm.barrier0`.
- assert(name.starts_with("llvm.");
+ assert(name.starts_with("llvm.") && "expected llvm. prefix");
name = name.drop_front(/*strlen("llvm.")=*/5);
cir::FuncType intrinsicType =
>From 80431425ac91b035e4095eb4d067d0b72278c73f Mon Sep 17 00:00:00 2001
From: Priyanshu <10b.priyanshu at gmail.com>
Date: Thu, 5 Feb 2026 11:26:14 +0530
Subject: [PATCH 06/13] Add test
---
clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 2 ++
clang/test/CIR/CodeGen/builtins-x86.c | 36 +++++++++++++++++++++++++
2 files changed, 38 insertions(+)
create mode 100644 clang/test/CIR/CodeGen/builtins-x86.c
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 4c0bb82848d4d..0da9e3189f785 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -779,6 +779,8 @@ static mlir::Value getCorrectedPtr(mlir::Value argValue, mlir::Type expectedTy,
assert(ptrType != expectedPtrType && "types should not match");
if (ptrType.getAddrSpace() != expectedPtrType.getAddrSpace()) {
+ assert(!cir::MissingFeatures::addressSpace() &&
+ "address space handling not yet implemented");
auto newPtrType = cir::PointerType::get(ptrType.getPointee(),
expectedPtrType.getAddrSpace());
return builder.createAddrSpaceCast(argValue, newPtrType);
diff --git a/clang/test/CIR/CodeGen/builtins-x86.c b/clang/test/CIR/CodeGen/builtins-x86.c
new file mode 100644
index 0000000000000..0748147bd5b57
--- /dev/null
+++ b/clang/test/CIR/CodeGen/builtins-x86.c
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t-ogcg.ll
+// RUN: FileCheck --input-file=%t-ogcg.ll %s -check-prefix=OGCG
+
+void test_sfence(void) {
+ // CIR-LABEL: @test_sfence
+ // CIR: cir.call_llvm_intrinsic "x86.sse.sfence" : () -> !void
+ // LLVM-LABEL: @test_sfence
+ // LLVM: call void @llvm.x86.sse.sfence
+ // OGCG-LABEL: @test_sfence
+ // OGCG: call void @llvm.x86.sse.sfence
+ __builtin_ia32_sfence();
+}
+
+// CIR-LABEL: @test_lfence
+void test_lfence(void) {
+ // CIR: cir.call_llvm_intrinsic "x86.sse2.lfence" : () -> !void
+ // LLVM-LABEL: @test_lfence
+ // LLVM: call void @llvm.x86.sse2.lfence()
+ // OGCG-LABEL: @test_lfence
+ // OGCG: call void @llvm.x86.sse2.lfence()
+ __builtin_ia32_lfence();
+}
+
+void test_pause(void) {
+ // CIR-LABEL: @test_pause
+ // CIR: cir.call_llvm_intrinsic "x86.sse2.pause" : () -> !void
+ // LLVM-LABEL: @test_pause
+ // LLVM: call void @llvm.x86.sse2.pause()
+ // OGCG-LABEL: @test_pause
+ // OGCG: call void @llvm.x86.sse2.pause()
+ __builtin_ia32_pause();
+}
>From 591473f146bff182d403ce2baab8e3df0c80d619 Mon Sep 17 00:00:00 2001
From: Priyanshu <10b.priyanshu at gmail.com>
Date: Thu, 5 Feb 2026 11:34:56 +0530
Subject: [PATCH 07/13] Update test
---
clang/test/CIR/CodeGen/builtins-x86.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/clang/test/CIR/CodeGen/builtins-x86.c b/clang/test/CIR/CodeGen/builtins-x86.c
index 0748147bd5b57..e03cd64e063dc 100644
--- a/clang/test/CIR/CodeGen/builtins-x86.c
+++ b/clang/test/CIR/CodeGen/builtins-x86.c
@@ -8,18 +8,22 @@
void test_sfence(void) {
// CIR-LABEL: @test_sfence
// CIR: cir.call_llvm_intrinsic "x86.sse.sfence" : () -> !void
+
// LLVM-LABEL: @test_sfence
// LLVM: call void @llvm.x86.sse.sfence
+
// OGCG-LABEL: @test_sfence
// OGCG: call void @llvm.x86.sse.sfence
__builtin_ia32_sfence();
}
-// CIR-LABEL: @test_lfence
void test_lfence(void) {
+ // CIR-LABEL: @test_lfence
// CIR: cir.call_llvm_intrinsic "x86.sse2.lfence" : () -> !void
+
// LLVM-LABEL: @test_lfence
// LLVM: call void @llvm.x86.sse2.lfence()
+
// OGCG-LABEL: @test_lfence
// OGCG: call void @llvm.x86.sse2.lfence()
__builtin_ia32_lfence();
@@ -28,8 +32,10 @@ void test_lfence(void) {
void test_pause(void) {
// CIR-LABEL: @test_pause
// CIR: cir.call_llvm_intrinsic "x86.sse2.pause" : () -> !void
+
// LLVM-LABEL: @test_pause
// LLVM: call void @llvm.x86.sse2.pause()
+
// OGCG-LABEL: @test_pause
// OGCG: call void @llvm.x86.sse2.pause()
__builtin_ia32_pause();
>From bcb49de67658a61b1fd3b2062e65a62a481e96fb Mon Sep 17 00:00:00 2001
From: Priyanshu <10b.priyanshu at gmail.com>
Date: Sun, 8 Feb 2026 10:11:26 +0530
Subject: [PATCH 08/13] Update CIRGenBuiltin.cpp
---
clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 0da9e3189f785..6fc48f59e9d02 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -764,9 +764,9 @@ static mlir::Type correctIntegerSignedness(mlir::Type iitType, QualType astType,
if (!intTy)
return iitType;
- if (astType->isUnsignedIntegerType()) {
+ if (astType->isUnsignedIntegerType())
return cir::IntType::get(context, intTy.getWidth(), /*isSigned=*/false);
- }
+
return iitType;
}
@@ -1953,6 +1953,11 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
// we need to do a bit cast.
mlir::Type argType = argValue.getType();
mlir::Type expectedTy = intrinsicType.getInput(i);
+ if (!mlir::isa<cir::PointerType>(expectedTy)) {
+ cgm.errorNYI(e->getSourceRange(),
+ "intrinsic expects a pointer type (NYI for non-pointer)");
+ return getUndefRValue(e->getType());
+ }
// Correct integer signedness based on AST parameter type
mlir::Type correctedExpectedTy = expectedTy;
>From 1a92a0d180cbec053b14c02aaff7209abb7d604d Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <10b.priyanshu at gmail.com>
Date: Sun, 8 Feb 2026 10:30:22 +0530
Subject: [PATCH 09/13] Remove support for float/double arguments
Removed handling for Float and Double types in intrinsic type descriptor.
---
clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 4 ----
1 file changed, 4 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 6fc48f59e9d02..27cb87ef01498 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -745,10 +745,6 @@ decodeFixedType(CIRGenFunction &cgf,
case IITDescriptor::Integer:
return cir::IntType::get(context, descriptor.Integer_Width,
/*isSigned=*/true);
- case IITDescriptor::Float:
- return cir::SingleType::get(context);
- case IITDescriptor::Double:
- return cir::DoubleType::get(context);
default:
cgf.cgm.errorNYI("Unimplemented intrinsic type descriptor");
return cir::VoidType::get(context);
>From b1370f8a8c8fb165d5281a6e75b94549a8e2fab5 Mon Sep 17 00:00:00 2001
From: Priyanshu <10b.priyanshu at gmail.com>
Date: Sun, 8 Feb 2026 13:38:01 +0530
Subject: [PATCH 10/13] Update CIRGenBuiltin.cpp
---
clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 30 +++++++++++++++++++------
1 file changed, 23 insertions(+), 7 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 27cb87ef01498..c1c45d40f8296 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -1949,11 +1949,6 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
// we need to do a bit cast.
mlir::Type argType = argValue.getType();
mlir::Type expectedTy = intrinsicType.getInput(i);
- if (!mlir::isa<cir::PointerType>(expectedTy)) {
- cgm.errorNYI(e->getSourceRange(),
- "intrinsic expects a pointer type (NYI for non-pointer)");
- return getUndefRValue(e->getType());
- }
// Correct integer signedness based on AST parameter type
mlir::Type correctedExpectedTy = expectedTy;
@@ -1962,8 +1957,29 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
expectedTy, fd->getParamDecl(i)->getType(), &getMLIRContext());
}
- if (argType != correctedExpectedTy)
- argValue = getCorrectedPtr(argValue, expectedTy, builder);
+ if (mlir::isa<cir::PointerType>(expectedTy)) {
+ bool argIsPointer = mlir::isa<cir::PointerType>(argType);
+ bool argIsVectorOfPointer = false;
+ if (auto vecTy = dyn_cast<mlir::VectorType>(argType))
+ argIsVectorOfPointer =
+ mlir::isa<cir::PointerType>(vecTy.getElementType());
+
+ if (!argIsPointer && !argIsVectorOfPointer) {
+ cgm.errorNYI(
+ e->getSourceRange(),
+ "intrinsic expects a pointer type (NYI for non-pointer)");
+ return getUndefRValue(e->getType());
+ }
+
+ // Pointer handling (address-space cast / bitcast fallback).
+ if (argType != expectedTy)
+ argValue = getCorrectedPtr(argValue, expectedTy, builder);
+ } else {
+ // Non-pointer expected type: if needed, bitcast to the corrected
+ // expected type to match signedness/representation.
+ if (argType != correctedExpectedTy)
+ argValue = builder.createBitcast(argValue, correctedExpectedTy);
+ }
args.push_back(argValue);
}
>From 4a8a5bfd85e151d0ed401a230ec9a3c40f7b5934 Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <10b.priyanshu at gmail.com>
Date: Tue, 10 Feb 2026 23:25:10 +0530
Subject: [PATCH 11/13] Update clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
Co-authored-by: Andy Kaylor <akaylor at nvidia.com>
---
clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index c1c45d40f8296..b6ce5207039b7 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -768,8 +768,7 @@ static mlir::Type correctIntegerSignedness(mlir::Type iitType, QualType astType,
static mlir::Value getCorrectedPtr(mlir::Value argValue, mlir::Type expectedTy,
CIRGenBuilderTy &builder) {
- auto ptrType = mlir::dyn_cast<cir::PointerType>(argValue.getType());
- assert(ptrType && "expected pointer type");
+ auto ptrType = mlir::cast<cir::PointerType>(argValue.getType());
auto expectedPtrType = mlir::cast<cir::PointerType>(expectedTy);
assert(ptrType != expectedPtrType && "types should not match");
>From 660efb74af20fd20f04836128c6eeca0c4a655d1 Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <10b.priyanshu at gmail.com>
Date: Wed, 11 Feb 2026 05:59:23 +0530
Subject: [PATCH 12/13] [CIR][X86] Add support for vpshl/vpshr builtins
(#179538)
This patch also adds support for fshl/fshr operations so that
vpshl/vpshr intrinsics can lower to them.
Part of: #167765
---
clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 19 +-
clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 13 +-
.../X86/avx512vbmi2-builtins.c | 401 ++++++++++++++++++
.../CodeGenBuiltins/builtins-elementwise.c | 87 ++++
4 files changed, 514 insertions(+), 6 deletions(-)
create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/avx512vbmi2-builtins.c
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index b6ce5207039b7..1b479dde8b718 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -1339,8 +1339,23 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
case Builtin::BI__builtin_elementwise_canonicalize:
case Builtin::BI__builtin_elementwise_copysign:
case Builtin::BI__builtin_elementwise_fma:
- case Builtin::BI__builtin_elementwise_fshl:
- case Builtin::BI__builtin_elementwise_fshr:
+ return errorBuiltinNYI(*this, e, builtinID);
+ case Builtin::BI__builtin_elementwise_fshl: {
+ mlir::Location loc = getLoc(e->getExprLoc());
+ mlir::Value a = emitScalarExpr(e->getArg(0));
+ mlir::Value b = emitScalarExpr(e->getArg(1));
+ mlir::Value c = emitScalarExpr(e->getArg(2));
+ return RValue::get(builder.emitIntrinsicCallOp(loc, "fshl", a.getType(),
+ mlir::ValueRange{a, b, c}));
+ }
+ case Builtin::BI__builtin_elementwise_fshr: {
+ mlir::Location loc = getLoc(e->getExprLoc());
+ mlir::Value a = emitScalarExpr(e->getArg(0));
+ mlir::Value b = emitScalarExpr(e->getArg(1));
+ mlir::Value c = emitScalarExpr(e->getArg(2));
+ return RValue::get(builder.emitIntrinsicCallOp(loc, "fshr", a.getType(),
+ mlir::ValueRange{a, b, c}));
+ }
case Builtin::BI__builtin_elementwise_add_sat:
case Builtin::BI__builtin_elementwise_sub_sat:
case Builtin::BI__builtin_elementwise_max:
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index cad80317cb870..7800e90d130b5 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -2058,6 +2058,10 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
case X86::BI__builtin_ia32_pternlogd256_maskz:
case X86::BI__builtin_ia32_pternlogq128_maskz:
case X86::BI__builtin_ia32_pternlogq256_maskz:
+ cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented X86 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ return mlir::Value{};
case X86::BI__builtin_ia32_vpshldd128:
case X86::BI__builtin_ia32_vpshldd256:
case X86::BI__builtin_ia32_vpshldd512:
@@ -2067,6 +2071,8 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
case X86::BI__builtin_ia32_vpshldw128:
case X86::BI__builtin_ia32_vpshldw256:
case X86::BI__builtin_ia32_vpshldw512:
+ return emitX86FunnelShift(builder, getLoc(expr->getExprLoc()), ops[0],
+ ops[1], ops[2], false);
case X86::BI__builtin_ia32_vpshrdd128:
case X86::BI__builtin_ia32_vpshrdd256:
case X86::BI__builtin_ia32_vpshrdd512:
@@ -2076,10 +2082,9 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
case X86::BI__builtin_ia32_vpshrdw128:
case X86::BI__builtin_ia32_vpshrdw256:
case X86::BI__builtin_ia32_vpshrdw512:
- cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented X86 builtin call: ") +
- getContext().BuiltinInfo.getName(builtinID));
- return mlir::Value{};
+ // Ops 0 and 1 are swapped.
+ return emitX86FunnelShift(builder, getLoc(expr->getExprLoc()), ops[1],
+ ops[0], ops[2], true);
case X86::BI__builtin_ia32_reduce_fadd_pd512:
case X86::BI__builtin_ia32_reduce_fadd_ps512:
case X86::BI__builtin_ia32_reduce_fadd_ph512:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vbmi2-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vbmi2-builtins.c
new file mode 100644
index 0000000000000..170c6fd48ac81
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vbmi2-builtins.c
@@ -0,0 +1,401 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding -triple x86_64-unknown-linux-gnu -fclangir -target-feature +avx512vbmi2 -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding -triple x86_64-unknown-linux-gnu -fclangir -target-feature +avx512vbmi2 -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vbmi2 -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=OGCG --input-file=%t.ll %s
+
+
+#include <immintrin.h>
+
+__m512i test_mm512_shldv_epi64(__m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @_mm512_shldv_epi64
+ // CIR: %{{.*}} = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s64i> -> !cir.vector<8 x !u64i>
+ // CIR: %{{.*}} = cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !u64i>, !cir.vector<8 x !u64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !u64i>
+ // CIR: %{{.*}} = cir.cast bitcast %{{.*}} : !cir.vector<8 x !u64i> -> !cir.vector<8 x !s64i>
+ // CIR-LABEL: @test_mm512_shldv_epi64
+ // CIR: %{{.*}} = cir.call @_mm512_shldv_epi64
+ // LLVM-LABEL: @test_mm512_shldv_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_shldv_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64>
+ return _mm512_shldv_epi64(s, a, b);
+}
+
+__m512i test_mm512_mask_shldi_epi64(__m512i s, __mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: test_mm512_mask_shldi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}}
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}})
+ // LLVM-LABEL: @test_mm512_mask_shldi_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 47))
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_mask_shldi_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 47))
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_mask_shldi_epi64(s, u, a, b, 47);
+}
+
+__m512i test_mm512_maskz_shldi_epi64(__mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: test_mm512_maskz_shldi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_maskz_shldi_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 63))
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_maskz_shldi_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 63))
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_maskz_shldi_epi64(u, a, b, 63);
+}
+
+__m512i test_mm512_shldi_epi64(__m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shldi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_shldi_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 31))
+ // OGCG-LABEL: @test_mm512_shldi_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 31))
+ return _mm512_shldi_epi64(a, b, 31);
+}
+
+__m512i test_mm512_mask_shldi_epi32(__m512i s, __mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: test_mm512_mask_shldi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_mask_shldi_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 7))
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_mask_shldi_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 7))
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_shldi_epi32(s, u, a, b, 7);
+}
+
+__m512i test_mm512_maskz_shldi_epi32(__mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: test_mm512_maskz_shldi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_maskz_shldi_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 15))
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_maskz_shldi_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 15))
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_shldi_epi32(u, a, b, 15);
+}
+
+__m512i test_mm512_shldi_epi32(__m512i a, __m512i b) {
+ // CIR-LABEL: test_mm512_shldi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_shldi_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 31))
+ // OGCG-LABEL: @test_mm512_shldi_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 31))
+ return _mm512_shldi_epi32(a, b, 31);
+}
+
+__m512i test_mm512_mask_shldi_epi16(__m512i s, __mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: test_mm512_mask_shldi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<32 x !s16i> -> !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_mask_shldi_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 3))
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_mask_shldi_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 3))
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_mask_shldi_epi16(s, u, a, b, 3);
+}
+
+__m512i test_mm512_maskz_shldi_epi16(__mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: test_mm512_maskz_shldi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<32 x !s16i> -> !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_maskz_shldi_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 15))
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_maskz_shldi_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 15))
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_maskz_shldi_epi16(u, a, b, 15);
+}
+
+__m512i test_mm512_shldi_epi16(__m512i a, __m512i b) {
+ // CIR-LABEL: test_mm512_shldi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<32 x !s16i> -> !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_shldi_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 31))
+ // OGCG-LABEL: @test_mm512_shldi_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 31))
+ return _mm512_shldi_epi16(a, b, 31);
+}
+
+__m512i test_mm512_mask_shldv_epi64(__m512i s, __mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: _mm512_mask_shldv_epi64
+ // CIR: cir.call @_mm512_shldv_epi64(%{{.*}}, %{{.*}}, %{{.*}}){{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.cast bitcast %{{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // CIR-LABEL: test_mm512_mask_shldv_epi64
+ // CIR: cir.call @_mm512_mask_shldv_epi64
+ // LLVM-LABEL: @test_mm512_mask_shldv_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_mask_shldv_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64>
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_mask_shldv_epi64(s, u, a, b);
+}
+
+__m512i test_mm512_shldv_epi32(__m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: _mm512_shldv_epi32
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !u32i>, !cir.vector<16 x !u32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !u32i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<16 x !u32i> -> !cir.vector<8 x !s64i>
+ // CIR-LABEL: test_mm512_shldv_epi32
+ // CIR: cir.call @_mm512_shldv_epi32
+ // LLVM-LABEL: @test_mm512_shldv_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_shldv_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ return _mm512_shldv_epi32(s, a, b);
+}
+
+__m512i test_mm512_mask_shldv_epi16(__m512i s, __mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @_mm512_mask_shldv_epi16
+ // CIR: cir.call @_mm512_shldv_epi16(%{{.*}}, %{{.*}}, %{{.*}}){{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<8 x !s64i> -> !cir.vector<32 x !s16i>
+ // CIR-LABEL: @test_mm512_mask_shldv_epi16
+ // CIR: cir.call @_mm512_mask_shldv_epi16
+ // LLVM-LABEL: @test_mm512_mask_shldv_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_mask_shldv_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16>
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_mask_shldv_epi16(s, u, a, b);
+}
+
+__m512i test_mm512_maskz_shldv_epi16(__mmask32 u, __m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: _mm512_maskz_shldv_epi16
+ // CIR: cir.call @_mm512_shldv_epi16(%{{.*}}, %{{.*}}, %{{.*}}){{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<8 x !s64i> -> !cir.vector<32 x !s16i>
+ // CIR-LABEL: @test_mm512_maskz_shldv_epi16
+ // CIR: cir.call @_mm512_maskz_shldv_epi16
+ // LLVM-LABEL: @test_mm512_maskz_shldv_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_maskz_shldv_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16>
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_maskz_shldv_epi16(u, s, a, b);
+}
+
+__m512i test_mm512_shldv_epi16(__m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: _mm512_shldv_epi16
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}}{{.*}} : (!cir.vector<32 x !u16i>, !cir.vector<32 x !u16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !u16i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<32 x !u16i> -> !cir.vector<8 x !s64i>
+ // CIR-LABEL: @test_mm512_shldv_epi16
+ // CIR: cir.call @_mm512_shldv_epi16
+ // LLVM-LABEL: @test_mm512_shldv_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_shldv_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16>
+ return _mm512_shldv_epi16(s, a, b);
+}
+
+__m512i test_mm512_mask_shrdi_epi64(__m512i s, __mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shrdi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_mask_shrdi_epi64
+ // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 47))
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_mask_shrdi_epi64
+ // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 47))
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_mask_shrdi_epi64(s, u, a, b, 47);
+}
+
+__m512i test_mm512_maskz_shrdi_epi64(__mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shrdi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.call @_mm512_setzero_si512() {{.*}} : () -> !cir.vector<8 x !s64i>
+ // CIR: cir.cast bitcast %{{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_maskz_shrdi_epi64
+ // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 63))
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_maskz_shrdi_epi64
+ // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 63))
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_maskz_shrdi_epi64(u, a, b, 63);
+}
+
+__m512i test_mm512_shrdi_epi64(__m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shrdi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_shrdi_epi64
+ // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 31))
+ // OGCG-LABEL: @test_mm512_shrdi_epi64
+ // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 31))
+ return _mm512_shrdi_epi64(a, b, 31);
+}
+
+__m512i test_mm512_mask_shrdi_epi32(__m512i s, __mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shrdi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_mask_shrdi_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 7))
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_mask_shrdi_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 7))
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_shrdi_epi32(s, u, a, b, 7);
+}
+
+__m512i test_mm512_maskz_shrdi_epi32(__mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shrdi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_maskz_shrdi_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 15))
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_maskz_shrdi_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 15))
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_shrdi_epi32(u, a, b, 15);
+}
+
+__m512i test_mm512_shrdi_epi32(__m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shrdi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_shrdi_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 31))
+ // OGCG-LABEL: @test_mm512_shrdi_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 31))
+ return _mm512_shrdi_epi32(a, b, 31);
+}
+
+__m512i test_mm512_mask_shrdi_epi16(__m512i s, __mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shrdi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<32 x !s16i> -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_mask_shrdi_epi16
+ // LLVM: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 3))
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_mask_shrdi_epi16
+ // OGCG: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 3))
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_mask_shrdi_epi16(s, u, a, b, 3);
+}
+
+__m512i test_mm512_maskz_shrdi_epi16(__mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shrdi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<32 x !s16i> -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_maskz_shrdi_epi16
+ // LLVM: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 15))
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_maskz_shrdi_epi16
+ // OGCG: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 15))
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_maskz_shrdi_epi16(u, a, b, 15);
+}
+
+__m512i test_mm512_shrdi_epi16(__m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shrdi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<32 x !s16i> -> !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_shrdi_epi16
+ // LLVM: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 31))
+ // OGCG-LABEL: @test_mm512_shrdi_epi16
+ // OGCG: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 31))
+ return _mm512_shrdi_epi16(a, b, 31);
+}
+
+__m512i test_mm512_mask_shldv_epi32(__m512i s, __mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: _mm512_mask_shldv_epi32
+ // CIR: cir.call @_mm512_shldv_epi32(%{{.*}}, %{{.*}}, %{{.*}}){{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<8 x !s64i> -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // CIR-LABEL: test_mm512_mask_shldv_epi32
+ // CIR: cir.call @_mm512_mask_shldv_epi32
+ // LLVM-LABEL: @test_mm512_mask_shldv_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_mask_shldv_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_shldv_epi32(s, u, a, b);
+}
+
+__m512i test_mm512_maskz_shldv_epi32(__mmask16 u, __m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: _mm512_maskz_shldv_epi32
+ // CIR: cir.call @_mm512_shldv_epi32(%{{.*}}, %{{.*}}, %{{.*}}){{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<8 x !s64i> -> !cir.vector<16 x !s32i>
+ // CIR: cir.call @_mm512_setzero_si512() {{.*}} : () -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // CIR-LABEL: test_mm512_maskz_shldv_epi32
+ // CIR: cir.call @_mm512_maskz_shldv_epi32
+ // LLVM-LABEL: @test_mm512_maskz_shldv_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_maskz_shldv_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_shldv_epi32(u, s, a, b);
+}
+
+__m512i test_mm512_mask_shrdv_epi32(__m512i s, __mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @_mm512_shrdv_epi32
+ // CIR: cir.call @_mm512_shrdv_epi32(%{{.*}}, %{{.*}}, %{{.*}}){{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<8 x !s64i> -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // CIR-LABEL: @test_mm512_mask_shrdv_epi32
+ // CIR: cir.call @_mm512_mask_shrdv_epi32
+ // LLVM-LABEL: @test_mm512_mask_shrdv_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_mask_shrdv_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_shrdv_epi32(s, u, a, b);
+}
+
+__m512i test_mm512_maskz_shrdv_epi32(__mmask16 u, __m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: _mm512_maskz_shrdv_epi32
+ // CIR: cir.call @_mm512_shrdv_epi32(%{{.*}}, %{{.*}}, %{{.*}}){{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<8 x !s64i> -> !cir.vector<16 x !s32i>
+ // CIR: cir.call @_mm512_setzero_si512() {{.*}} : () -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // CIR-LABEL: test_mm512_maskz_shrdv_epi32
+ // CIR: cir.call @_mm512_maskz_shrdv_epi32
+ // LLVM-LABEL: @test_mm512_maskz_shrdv_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_maskz_shrdv_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_shrdv_epi32(u, s, a, b);
+}
+
+__m512i test_mm512_mask_shrdv_epi16(__m512i s, __mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: _mm512_mask_shrdv_epi16
+ // CIR: cir.call @_mm512_shrdv_epi16(%{{.*}}, %{{.*}}, %{{.*}}){{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.cast bitcast %{{.*}} : !cir.vector<8 x !s64i> -> !cir.vector<32 x !s16i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // CIR-LABEL: test_mm512_mask_shrdv_epi16
+ // CIR: cir.call @_mm512_mask_shrdv_epi16
+ // LLVM: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_mask_shrdv_epi16
+ // OGCG: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16>
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_mask_shrdv_epi16(s, u, a, b);
+}
diff --git a/clang/test/CIR/CodeGenBuiltins/builtins-elementwise.c b/clang/test/CIR/CodeGenBuiltins/builtins-elementwise.c
index f64080b829bdf..80fc0682f8126 100644
--- a/clang/test/CIR/CodeGenBuiltins/builtins-elementwise.c
+++ b/clang/test/CIR/CodeGenBuiltins/builtins-elementwise.c
@@ -6,6 +6,7 @@
// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
typedef int vint4 __attribute__((ext_vector_type(4)));
+typedef short vshort8 __attribute__((ext_vector_type(8)));
typedef float vfloat4 __attribute__((ext_vector_type(4)));
typedef double vdouble4 __attribute__((ext_vector_type(4)));
@@ -116,3 +117,89 @@ void test_builtin_elementwise_cos(float f, double d, vfloat4 vf4,
// OGCG: {{%.*}} = call <4 x double> @llvm.cos.v4f64(<4 x double> {{%.*}})
vd4 = __builtin_elementwise_cos(vd4);
}
+
+void test_builtin_elementwise_fshl(long long int i1, long long int i2,
+ long long int i3, unsigned short us1,
+ unsigned short us2, unsigned short us3,
+ char c1, char c2, char c3,
+ unsigned char uc1, unsigned char uc2,
+ unsigned char uc3, vshort8 vi1,
+ vshort8 vi2, vshort8 vi3, vint4 vu1,
+ vint4 vu2, vint4 vu3) {
+ // CIR-LABEL: test_builtin_elementwise_fshl
+ // LLVM-LABEL: test_builtin_elementwise_fshl
+ // OGCG-LABEL: test_builtin_elementwise_fshl
+
+ // CIR: %{{.*}} = cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!s64i, !s64i, !s64i) -> !s64i
+ // LLVM: %{{.*}} = call i64 @llvm.fshl.i64(i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}})
+ // OGCG: %{{.*}} = call i64 @llvm.fshl.i64(i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}})
+ i1 = __builtin_elementwise_fshl(i1, i2, i3);
+
+ // CIR: %{{.*}} = cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!u16i, !u16i, !u16i) -> !u16i
+ // LLVM: %{{.*}} = call i16 @llvm.fshl.i16(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+ // OGCG: %{{.*}} = call i16 @llvm.fshl.i16(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+ us1 = __builtin_elementwise_fshl(us1, us2, us3);
+
+ // CIR: %{{.*}} = cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!s8i, !s8i, !s8i) -> !s8i
+ // LLVM: %{{.*}} = call i8 @llvm.fshl.i8(i8 %{{.*}}, i8 %{{.*}}, i8 %{{.*}})
+ // OGCG: %{{.*}} = call i8 @llvm.fshl.i8(i8 %{{.*}}, i8 %{{.*}}, i8 %{{.*}})
+ c1 = __builtin_elementwise_fshl(c1, c2, c3);
+
+ // CIR: %{{.*}} = cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!u8i, !u8i, !u8i) -> !u8i
+ // LLVM: %{{.*}} = call i8 @llvm.fshl.i8(i8 %{{.*}}, i8 %{{.*}}, i8 %{{.*}})
+ // OGCG: %{{.*}} = call i8 @llvm.fshl.i8(i8 %{{.*}}, i8 %{{.*}}, i8 %{{.*}})
+ uc1 = __builtin_elementwise_fshl(uc1, uc2, uc3);
+
+ // CIR: %{{.*}} = cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>) -> !cir.vector<8 x !s16i>
+ // LLVM: %{{.*}} = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+ // OGCG: %{{.*}} = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+ vi1 = __builtin_elementwise_fshl(vi1, vi2, vi3);
+
+ // CIR: %{{.*}} = cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i>
+ // LLVM: %{{.*}} = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // OGCG: %{{.*}} = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ vu1 = __builtin_elementwise_fshl(vu1, vu2, vu3);
+}
+
+void test_builtin_elementwise_fshr(long long int i1, long long int i2,
+ long long int i3, unsigned short us1,
+ unsigned short us2, unsigned short us3,
+ char c1, char c2, char c3,
+ unsigned char uc1, unsigned char uc2,
+ unsigned char uc3, vshort8 vi1,
+ vshort8 vi2, vshort8 vi3, vint4 vu1,
+ vint4 vu2, vint4 vu3) {
+ // CIR-LABEL: test_builtin_elementwise_fshr
+ // LLVM-LABEL: test_builtin_elementwise_fshr
+ // OGCG-LABEL: test_builtin_elementwise_fshr
+
+ // CIR: %{{.*}} = cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!s64i, !s64i, !s64i) -> !s64i
+ // LLVM: %{{.*}} = call i64 @llvm.fshr.i64(i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}})
+ // OGCG: %{{.*}} = call i64 @llvm.fshr.i64(i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}})
+ i1 = __builtin_elementwise_fshr(i1, i2, i3);
+
+ // CIR: %{{.*}} = cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!u16i, !u16i, !u16i) -> !u16i
+ // LLVM: %{{.*}} = call i16 @llvm.fshr.i16(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+ // OGCG: %{{.*}} = call i16 @llvm.fshr.i16(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+ us1 = __builtin_elementwise_fshr(us1, us2, us3);
+
+ // CIR: %{{.*}} = cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!s8i, !s8i, !s8i) -> !s8i
+ // LLVM: %{{.*}} = call i8 @llvm.fshr.i8(i8 %{{.*}}, i8 %{{.*}}, i8 %{{.*}})
+ // OGCG: %{{.*}} = call i8 @llvm.fshr.i8(i8 %{{.*}}, i8 %{{.*}}, i8 %{{.*}})
+ c1 = __builtin_elementwise_fshr(c1, c2, c3);
+
+ // CIR: %{{.*}} = cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!u8i, !u8i, !u8i) -> !u8i
+ // LLVM: %{{.*}} = call i8 @llvm.fshr.i8(i8 %{{.*}}, i8 %{{.*}}, i8 %{{.*}})
+ // OGCG: %{{.*}} = call i8 @llvm.fshr.i8(i8 %{{.*}}, i8 %{{.*}}, i8 %{{.*}})
+ uc1 = __builtin_elementwise_fshr(uc1, uc2, uc3);
+
+ // CIR: %{{.*}} = cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>) -> !cir.vector<8 x !s16i>
+ // LLVM: %{{.*}} = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+ // OGCG: %{{.*}} = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+ vi1 = __builtin_elementwise_fshr(vi1, vi2, vi3);
+
+ // CIR: %{{.*}} = cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i>
+ // LLVM: %{{.*}} = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // OGCG: %{{.*}} = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ vu1 = __builtin_elementwise_fshr(vu1, vu2, vu3);
+}
>From 1de953cd5929f5a8c8249738e671ae862fbc083e Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <10b.priyanshu at gmail.com>
Date: Tue, 10 Feb 2026 23:25:10 +0530
Subject: [PATCH 13/13] Update clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
Co-authored-by: Andy Kaylor <akaylor at nvidia.com>
---
clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 1b479dde8b718..9af00b805c37e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -1924,6 +1924,12 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
return emitLibraryCall(*this, fd, e,
cgm.getBuiltinLibFunction(fd, builtinID));
+ // If this is a predefined lib function (e.g. malloc), emit the call
+ // using exactly the normal call path.
+ if (getContext().BuiltinInfo.isPredefinedLibFunction(builtinID))
+ return emitLibraryCall(*this, fd, e,
+ emitScalarExpr(e->getCallee()).getDefiningOp());
+
// See if we have a target specific intrinsic.
std::string name = getContext().BuiltinInfo.getName(builtinID);
Intrinsic::ID intrinsicID = Intrinsic::not_intrinsic;
More information about the cfe-commits
mailing list