[Mlir-commits] [mlir] a40f444 - [NVPTX] Add support for barrier.cta.red.* instructions (#172541)

llvmlistbot at llvm.org llvmlistbot at llvm.org
Thu Dec 18 18:06:34 PST 2025


Author: Alex MacLean
Date: 2025-12-18T18:06:27-08:00
New Revision: a40f4442652c68a123e724dbf2d4a8f43a6b0f2d

URL: https://github.com/llvm/llvm-project/commit/a40f4442652c68a123e724dbf2d4a8f43a6b0f2d
DIFF: https://github.com/llvm/llvm-project/commit/a40f4442652c68a123e724dbf2d4a8f43a6b0f2d.diff

LOG: [NVPTX] Add support for barrier.cta.red.* instructions (#172541)

This change adds full support for the PTX `barrier.cta.red` instruction,
following the same conventions as are already used for
`barrier.cta.sync` and `barrier.cta.arrive`.

In addition, this change removes the following intrinsics, which are no
longer needed:
* llvm.nvvm.barrier0.popc -->
  llvm.nvvm.barrier.cta.red.popc.aligned.all(0, c)
* llvm.nvvm.barrier0.and -->
  llvm.nvvm.barrier.cta.red.and.aligned.all(0, z)
* llvm.nvvm.barrier0.or -->
  llvm.nvvm.barrier.cta.red.or.aligned.all(0, z)

Added: 
    

Modified: 
    llvm/docs/NVPTXUsage.rst
    llvm/include/llvm/IR/IntrinsicsNVVM.td
    llvm/lib/IR/AutoUpgrade.cpp
    llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
    llvm/lib/Transforms/IPO/AttributorAttributes.cpp
    llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
    llvm/test/CodeGen/NVPTX/barrier.ll
    mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
    mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
    mlir/test/Target/LLVMIR/nvvm/barrier.mlir

Removed: 
    


################################################################################
diff  --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 5f7fb00889655..59a5c9c91e620 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -278,12 +278,25 @@ Syntax:
 
   declare void @llvm.nvvm.barrier.cta.sync.count(i32 %id, i32 %n)
   declare void @llvm.nvvm.barrier.cta.sync.all(i32 %id)
-  declare void @llvm.nvvm.barrier.cta.arrive.count(i32 %id, i32 %n)
-
   declare void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 %id, i32 %n)
   declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %id)
+
+  declare void @llvm.nvvm.barrier.cta.arrive.count(i32 %id, i32 %n)
   declare void @llvm.nvvm.barrier.cta.arrive.aligned.count(i32 %id, i32 %n)
 
+  declare i32 @llvm.nvvm.barrier.cta.red.popc.count(i32 %id, i32 %n, i1 %pred)
+  declare i32 @llvm.nvvm.barrier.cta.red.popc.all(i32 %id, i1 %pred)
+  declare i32 @llvm.nvvm.barrier.cta.red.popc.aligned.count(i32 %id, i32 %n, i1 %pred)
+  declare i32 @llvm.nvvm.barrier.cta.red.popc.aligned.all(i32 %id, i1 %pred)
+  declare i1 @llvm.nvvm.barrier.cta.red.and.count(i32 %id, i32 %n, i1 %pred)
+  declare i1 @llvm.nvvm.barrier.cta.red.and.all(i32 %id, i1 %pred)
+  declare i1 @llvm.nvvm.barrier.cta.red.and.aligned.count(i32 %id, i32 %n, i1 %pred)
+  declare i1 @llvm.nvvm.barrier.cta.red.and.aligned.all(i32 %id, i1 %pred)
+  declare i1 @llvm.nvvm.barrier.cta.red.or.count(i32 %id, i32 %n, i1 %pred)
+  declare i1 @llvm.nvvm.barrier.cta.red.or.all(i32 %id, i1 %pred)
+  declare i1 @llvm.nvvm.barrier.cta.red.or.aligned.count(i32 %id, i32 %n, i1 %pred)
+  declare i1 @llvm.nvvm.barrier.cta.red.or.aligned.all(i32 %id, i1 %pred)
+
 Overview:
 """""""""
 
@@ -305,15 +318,27 @@ the threads specified by the %n operand should participate in the barrier.
 All forms of the '``@llvm.nvvm.barrier.cta.*``' intrinsic cause the executing
 thread to wait for all non-exited threads from its warp and then marks the
 warp's arrival at the barrier. In addition to signaling its arrival at the 
-barrier, the '``@llvm.nvvm.barrier.cta.sync.*``' intrinsics cause the executing
-thread to wait for non-exited threads of all other warps participating in the
-barrier to arrive. On the other hand, the '``@llvm.nvvm.barrier.cta.arrive.*``'
-intrinsic does not cause the executing thread to wait for threads of other
-participating warps.
+barrier, the '``@llvm.nvvm.barrier.cta.red.*``' and
+'``@llvm.nvvm.barrier.cta.sync.*``' intrinsics cause the executing thread to 
+wait for non-exited threads of all other warps participating in the barrier to
+arrive. On the other hand, the '``@llvm.nvvm.barrier.cta.arrive.*``' intrinsic
+does not cause the executing thread to wait for threads of other participating
+warps.
 
 When a barrier completes, the waiting threads are restarted without delay,
 and the barrier is reinitialized so that it can be immediately reused.
 
+The '``@llvm.nvvm.barrier.cta.red.*``' intrinsics perform a reduction operation
+across threads. The %pred operands from all threads in the CTA are combined
+using the specified reduction operator. Once the barrier count is reached, the
+final value is returned in all threads waiting at the barrier.
+
+The reduction operations for '``@llvm.nvvm.barrier.cta.red.*``' are
+population-count ('``.popc``'), all-threads-true ('``.and``'), 
+and any-thread-true ('``.or``'). The result of '``.popc``' is the number of
+threads with a true predicate, while '``.and``' and '``.or``' indicate if all
+the threads had a true predicate or if any of the threads had a true predicate.
+
 The '``@llvm.nvvm.barrier.cta.*``' intrinsic has an optional '``.aligned``'
 modifier to indicate textual alignment of the barrier. When specified, it
 indicates that all threads in the CTA will execute the same

diff  --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index aab85c2a86373..bddbf4ea3c185 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -91,23 +91,23 @@
 // The following intrinsics were once defined here, but are now auto-upgraded
 // to target-generic LLVM intrinsics.
 //
-//   * llvm.nvvm.brev32  --> llvm.bitreverse.i32
-//   * llvm.nvvm.brev64  --> llvm.bitreverse.i64
-//   * llvm.nvvm.clz.i   --> llvm.ctlz.i32
-//   * llvm.nvvm.clz.ll  --> trunc i64 llvm.ctlz.i64(x) to i32
-//   * llvm.nvvm.popc.i  --> llvm.ctpop.i32
-//   * llvm.nvvm.popc.ll --> trunc i64 llvm.ctpop.i64 to i32
-//   * llvm.nvvm.abs.i   --> select(x >= -x, x, -x)
-//   * llvm.nvvm.abs.ll  --> ibid.
-//   * llvm.nvvm.max.i   --> select(x sge y, x, y)
-//   * llvm.nvvm.max.ll  --> ibid.
-//   * llvm.nvvm.max.ui  --> select(x uge y, x, y)
-//   * llvm.nvvm.max.ull --> ibid.
-//   * llvm.nvvm.max.i   --> select(x sle y, x, y)
-//   * llvm.nvvm.max.ll  --> ibid.
-//   * llvm.nvvm.max.ui  --> select(x ule y, x, y)
-//   * llvm.nvvm.max.ull --> ibid.
-//   * llvm.nvvm.h2f     --> llvm.convert.to.fp16.f32
+//   * llvm.nvvm.brev32              --> llvm.bitreverse.i32
+//   * llvm.nvvm.brev64              --> llvm.bitreverse.i64
+//   * llvm.nvvm.clz.i               --> llvm.ctlz.i32
+//   * llvm.nvvm.clz.ll              --> trunc i64 llvm.ctlz.i64(x) to i32
+//   * llvm.nvvm.popc.i              --> llvm.ctpop.i32
+//   * llvm.nvvm.popc.ll             --> trunc i64 llvm.ctpop.i64 to i32
+//   * llvm.nvvm.abs.i               --> select(x >= -x, x, -x)
+//   * llvm.nvvm.abs.ll              --> ibid.
+//   * llvm.nvvm.max.i               --> select(x sge y, x, y)
+//   * llvm.nvvm.max.ll              --> ibid.
+//   * llvm.nvvm.max.ui              --> select(x uge y, x, y)
+//   * llvm.nvvm.max.ull             --> ibid.
+//   * llvm.nvvm.min.i               --> select(x sle y, x, y)
+//   * llvm.nvvm.min.ll              --> ibid.
+//   * llvm.nvvm.min.ui              --> select(x ule y, x, y)
+//   * llvm.nvvm.min.ull             --> ibid.
+//   * llvm.nvvm.h2f                 --> llvm.convert.to.fp16.f32
 //   * llvm.nvvm.bitcast.f2i         --> bitcast
 //   * llvm.nvvm.bitcast.i2f         --> ibid.
 //   * llvm.nvvm.bitcast.d2ll        --> ibid.
@@ -134,6 +134,9 @@
 //   * llvm.nvvm.barrier             --> llvm.nvvm.barrier.cta.sync.aligned(x, y)
 //   * llvm.nvvm.barrier.sync        --> llvm.nvvm.barrier.cta.sync.all(x)
 //   * llvm.nvvm.barrier.sync.cnt    --> llvm.nvvm.barrier.cta.sync(x, y)
+//   * llvm.nvvm.barrier0.popc       --> llvm.nvvm.barrier.cta.red.popc.aligned.all(0, c)
+//   * llvm.nvvm.barrier0.and        --> llvm.nvvm.barrier.cta.red.and.aligned.all(0, z)
+//   * llvm.nvvm.barrier0.or         --> llvm.nvvm.barrier.cta.red.or.aligned.all(0, z)
 
 def llvm_global_ptr_ty  : LLVMQualPointerType<1>;         // (global)ptr
 def llvm_shared_ptr_ty  : LLVMQualPointerType<3>;         // (shared)ptr
@@ -1845,29 +1848,35 @@ let TargetPrefix = "nvvm" in {
   //
   // Bar.Sync
   //
-  def int_nvvm_barrier0_popc : ClangBuiltin<"__nvvm_bar0_popc">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
-  def int_nvvm_barrier0_and : ClangBuiltin<"__nvvm_bar0_and">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
-  def int_nvvm_barrier0_or : ClangBuiltin<"__nvvm_bar0_or">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
-
   def int_nvvm_bar_warp_sync : NVVMBuiltin,
       Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
 
-  // barrier{.cta}.sync{.aligned}      a{, b};
-  // barrier{.cta}.arrive{.aligned}    a, b;
-  let IntrProperties = [IntrConvergent, IntrNoCallback] in {
-    foreach align = ["", "_aligned"] in {
-      def int_nvvm_barrier_cta_sync # align # _all :
-          Intrinsic<[], [llvm_i32_ty]>;
-      def int_nvvm_barrier_cta_sync # align # _count :
-          Intrinsic<[], [llvm_i32_ty, llvm_i32_ty]>;
-      def int_nvvm_barrier_cta_arrive # align # _count :
-          Intrinsic<[], [llvm_i32_ty, llvm_i32_ty]>;
+  multiclass BarrierCTAIntrinsic<list<LLVMType> ret_types = [],
+                                 list<LLVMType> extra_param_types = [],
+                                 bit has_all_variant = true> {
+    let IntrProperties = [IntrConvergent, IntrNoCallback] in {
+      foreach align = ["", "_aligned"] in {
+        def align # _count :
+          Intrinsic<ret_types, [llvm_i32_ty, llvm_i32_ty] # extra_param_types>;
+        if has_all_variant then
+          def align # _all :
+            Intrinsic<ret_types, [llvm_i32_ty] # extra_param_types>;
+      }
     }
   }
 
+  // barrier{.cta}.sync{.aligned}      a{, b};
+  // barrier{.cta}.arrive{.aligned}    a, b;
+  defm int_nvvm_barrier_cta_sync : BarrierCTAIntrinsic<>;
+  defm int_nvvm_barrier_cta_arrive : BarrierCTAIntrinsic<has_all_variant = false>;
+
+  // barrier{.cta}.red.popc{.aligned}.u32  d, a{, b}, {!}c;
+  // barrier{.cta}.red.op{.aligned}.pred   p, a{, b}, {!}c;
+  // .op = { .and, .or };
+  defm int_nvvm_barrier_cta_red_popc : BarrierCTAIntrinsic<[llvm_i32_ty], [llvm_i1_ty]>;
+  defm int_nvvm_barrier_cta_red_and : BarrierCTAIntrinsic<[llvm_i1_ty], [llvm_i1_ty]>;
+  defm int_nvvm_barrier_cta_red_or : BarrierCTAIntrinsic<[llvm_i1_ty], [llvm_i1_ty]>;
+
   let IntrProperties = [IntrConvergent, IntrNoCallback] in {
     // barrier.cluster.[wait, arrive, arrive.relaxed]
     def int_nvvm_barrier_cluster_arrive : Intrinsic<[]>;

diff  --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 5efede4f87680..60505ac208c2c 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1619,6 +1619,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
                      .Case("barrier.sync", true)
                      .Case("barrier", true)
                      .Case("bar.sync", true)
+                     .Case("barrier0.popc", true)
+                     .Case("barrier0.and", true)
+                     .Case("barrier0.or", true)
                      .Case("clz.ll", true)
                      .Case("popc.ll", true)
                      .Case("h2f", true)
@@ -2743,6 +2746,21 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
   } else if (Name == "barrier.sync.cnt") {
     Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_count, {},
                                   {CI->getArgOperand(0), CI->getArgOperand(1)});
+  } else if (Name == "barrier0.popc" || Name == "barrier0.and" ||
+             Name == "barrier0.or") {
+    Value *C = CI->getArgOperand(0);
+    C = Builder.CreateICmpNE(C, Builder.getInt32(0));
+
+    Intrinsic::ID IID =
+        StringSwitch<Intrinsic::ID>(Name)
+            .Case("barrier0.popc",
+                  Intrinsic::nvvm_barrier_cta_red_popc_aligned_all)
+            .Case("barrier0.and",
+                  Intrinsic::nvvm_barrier_cta_red_and_aligned_all)
+            .Case("barrier0.or",
+                  Intrinsic::nvvm_barrier_cta_red_or_aligned_all);
+    Value *Bar = Builder.CreateIntrinsic(IID, {}, {Builder.getInt32(0), C});
+    Rep = Builder.CreateZExt(Bar, CI->getType());
   } else {
     Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
     if (IID != Intrinsic::not_intrinsic &&

diff  --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 817006c367379..b145e1d53f46c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -89,31 +89,6 @@ class RegSeq<int n, string prefix> {
 // Synchronization and shuffle functions
 //-----------------------------------
 let isConvergent = true in {
-def INT_BARRIER0_POPC : NVPTXInst<(outs B32:$dst), (ins B32:$pred),
-  !strconcat("{{ \n\t",
-             ".reg .pred \t%p1; \n\t",
-             "setp.ne.u32 \t%p1, $pred, 0; \n\t",
-             "bar.red.popc.u32 \t$dst, 0, %p1; \n\t",
-             "}}"),
-      [(set i32:$dst, (int_nvvm_barrier0_popc i32:$pred))]>;
-def INT_BARRIER0_AND : NVPTXInst<(outs B32:$dst), (ins B32:$pred),
-  !strconcat("{{ \n\t",
-             ".reg .pred \t%p1; \n\t",
-             ".reg .pred \t%p2; \n\t",
-             "setp.ne.u32 \t%p1, $pred, 0; \n\t",
-             "bar.red.and.pred \t%p2, 0, %p1; \n\t",
-             "selp.u32 \t$dst, 1, 0, %p2; \n\t",
-             "}}"),
-      [(set i32:$dst, (int_nvvm_barrier0_and i32:$pred))]>;
-def INT_BARRIER0_OR : NVPTXInst<(outs B32:$dst), (ins B32:$pred),
-  !strconcat("{{ \n\t",
-             ".reg .pred \t%p1; \n\t",
-             ".reg .pred \t%p2; \n\t",
-             "setp.ne.u32 \t%p1, $pred, 0; \n\t",
-             "bar.red.or.pred \t%p2, 0, %p1; \n\t",
-             "selp.u32 \t$dst, 1, 0, %p2; \n\t",
-             "}}"),
-      [(set i32:$dst, (int_nvvm_barrier0_or i32:$pred))]>;
 
 def INT_BAR_WARP_SYNC_I : BasicNVPTXInst<(outs), (ins i32imm:$i), "bar.warp.sync",
                              [(int_nvvm_bar_warp_sync imm:$i)]>,
@@ -122,44 +97,73 @@ def INT_BAR_WARP_SYNC_R : BasicNVPTXInst<(outs), (ins B32:$i), "bar.warp.sync",
                              [(int_nvvm_bar_warp_sync i32:$i)]>,
         Requires<[hasPTX<60>, hasSM<30>]>;
 
-multiclass BARRIER1<string asmstr, Intrinsic intrinsic, list<Predicate> requires = []> {
-  def _i : BasicNVPTXInst<(outs), (ins i32imm:$i), asmstr,
-                          [(intrinsic imm:$i)]>,
-           Requires<requires>;
-
-  def _r : BasicNVPTXInst<(outs), (ins B32:$i), asmstr,
-                          [(intrinsic i32:$i)]>,
-           Requires<requires>;
+multiclass BARRIER_ALL<string asmstr, Intrinsic intrinsic, list<Predicate> requires = []> {
+  let Predicates = requires in {
+    def _i : BasicNVPTXInst<(outs), (ins i32imm:$i), asmstr, [(intrinsic imm:$i)]>;
+    def _r : BasicNVPTXInst<(outs), (ins B32:$i), asmstr, [(intrinsic i32:$i)]>;
+  }
 }
 
-multiclass BARRIER2<string asmstr, Intrinsic intrinsic, list<Predicate> requires = []> {
-  def _rr : BasicNVPTXInst<(outs), (ins B32:$i, B32:$j), asmstr,
-                          [(intrinsic i32:$i, i32:$j)]>,
-            Requires<requires>;
-
-  def _ri : BasicNVPTXInst<(outs), (ins B32:$i, i32imm:$j), asmstr,
-                          [(intrinsic i32:$i, imm:$j)]>,
-            Requires<requires>;
+multiclass BARRIER_COUNT<string asmstr, Intrinsic intrinsic, list<Predicate> requires = []> {
+  let Predicates = requires in {
+    def _rr : BasicNVPTXInst<(outs), (ins B32:$i, B32:$j), asmstr, 
+                             [(intrinsic i32:$i, i32:$j)]>;
+    def _ri : BasicNVPTXInst<(outs), (ins B32:$i, i32imm:$j), asmstr, 
+                             [(intrinsic i32:$i, imm:$j)]>;
+    def _ir : BasicNVPTXInst<(outs), (ins i32imm:$i, B32:$j), asmstr,
+                             [(intrinsic imm:$i, i32:$j)]>;
+    def _ii : BasicNVPTXInst<(outs), (ins i32imm:$i, i32imm:$j), asmstr,
+                             [(intrinsic imm:$i, imm:$j)]>;
+  }
+}
 
-  def _ir : BasicNVPTXInst<(outs), (ins i32imm:$i, B32:$j), asmstr,
-                          [(intrinsic imm:$i, i32:$j)]>,
-            Requires<requires>;
+multiclass BARRIER_RED_ALL<string asmstr, Intrinsic intrinsic, RegTyInfo dst_rt, list<Predicate> requires = []> {
+  let Predicates = requires in {
+    def _ip : BasicNVPTXInst<(outs dst_rt.RC:$dest), (ins i32imm:$i, B1:$pred), asmstr,
+                             [(set dst_rt.Ty:$dest, (intrinsic imm:$i, i1:$pred))]>;
+    def _rp : BasicNVPTXInst<(outs dst_rt.RC:$dest), (ins B32:$i, B1:$pred), asmstr, 
+                             [(set dst_rt.Ty:$dest, (intrinsic i32:$i, i1:$pred))]>;
+  }
+}
 
-  def _ii : BasicNVPTXInst<(outs), (ins i32imm:$i, i32imm:$j), asmstr,
-                          [(intrinsic imm:$i, imm:$j)]>,
-            Requires<requires>;
+multiclass BARRIER_RED_COUNT<string asmstr, Intrinsic intrinsic, RegTyInfo dst_rt, list<Predicate> requires = []> {
+  let Predicates = requires in {
+    def _rrp : BasicNVPTXInst<(outs dst_rt.RC:$dest), (ins B32:$i, B32:$j, B1:$pred), asmstr, 
+                              [(set dst_rt.Ty:$dest, (intrinsic i32:$i, i32:$j, i1:$pred))]>;
+    def _rip : BasicNVPTXInst<(outs dst_rt.RC:$dest), (ins B32:$i, i32imm:$j, B1:$pred), asmstr, 
+                              [(set dst_rt.Ty:$dest, (intrinsic i32:$i, imm:$j, i1:$pred))]>;
+    def _irp : BasicNVPTXInst<(outs dst_rt.RC:$dest), (ins i32imm:$i, B32:$j, B1:$pred), asmstr,
+                              [(set dst_rt.Ty:$dest, (intrinsic imm:$i, i32:$j, i1:$pred))]>;
+    def _iip : BasicNVPTXInst<(outs dst_rt.RC:$dest), (ins i32imm:$i, i32imm:$j, B1:$pred), asmstr,
+                              [(set dst_rt.Ty:$dest, (intrinsic imm:$i, imm:$j, i1:$pred))]>;
+  }
 }
 
+
 // Note the "bar.sync" variants could be renamed to the equivalent corresponding
 // "barrier.*.aligned" variants. We use the older syntax for compatibility with
 // older versions of the PTX ISA.
-defm BARRIER_CTA_SYNC_ALIGNED_ALL : BARRIER1<"bar.sync", int_nvvm_barrier_cta_sync_aligned_all>;
-defm BARRIER_CTA_SYNC_ALIGNED : BARRIER2<"bar.sync", int_nvvm_barrier_cta_sync_aligned_count>;
-defm BARRIER_CTA_ARRIVE_ALIGNED : BARRIER2<"bar.arrive", int_nvvm_barrier_cta_arrive_aligned_count>;
-
-defm BARRIER_CTA_SYNC_ALL : BARRIER1<"barrier.sync", int_nvvm_barrier_cta_sync_all, [hasPTX<60>]>;
-defm BARRIER_CTA_SYNC : BARRIER2<"barrier.sync", int_nvvm_barrier_cta_sync_count, [hasPTX<60>]>;
-defm BARRIER_CTA_ARRIVE : BARRIER2<"barrier.arrive", int_nvvm_barrier_cta_arrive_count, [hasPTX<60>]>;
+defm BARRIER_CTA_SYNC_ALIGNED_ALL : BARRIER_ALL<"bar.sync", int_nvvm_barrier_cta_sync_aligned_all>;
+defm BARRIER_CTA_SYNC_ALIGNED : BARRIER_COUNT<"bar.sync", int_nvvm_barrier_cta_sync_aligned_count>;
+defm BARRIER_CTA_ARRIVE_ALIGNED : BARRIER_COUNT<"bar.arrive", int_nvvm_barrier_cta_arrive_aligned_count>;
+
+defm BARRIER_CTA_SYNC_ALL : BARRIER_ALL<"barrier.sync", int_nvvm_barrier_cta_sync_all, [hasPTX<60>]>;
+defm BARRIER_CTA_SYNC : BARRIER_COUNT<"barrier.sync", int_nvvm_barrier_cta_sync_count, [hasPTX<60>]>;
+defm BARRIER_CTA_ARRIVE : BARRIER_COUNT<"barrier.arrive", int_nvvm_barrier_cta_arrive_count, [hasPTX<60>]>;
+
+defm BARRIER_CTA_RED_POPC_ALIGNED_ALL : BARRIER_RED_ALL<"bar.red.popc", int_nvvm_barrier_cta_red_popc_aligned_all, I32RT>;
+defm BARRIER_CTA_RED_AND_ALIGNED_ALL : BARRIER_RED_ALL<"bar.red.and", int_nvvm_barrier_cta_red_and_aligned_all, I1RT>;
+defm BARRIER_CTA_RED_OR_ALIGNED_ALL : BARRIER_RED_ALL<"bar.red.or", int_nvvm_barrier_cta_red_or_aligned_all, I1RT>;
+defm BARRIER_CTA_RED_POPC_ALIGNED : BARRIER_RED_COUNT<"bar.red.popc", int_nvvm_barrier_cta_red_popc_aligned_count, I32RT>;
+defm BARRIER_CTA_RED_AND_ALIGNED : BARRIER_RED_COUNT<"bar.red.and", int_nvvm_barrier_cta_red_and_aligned_count, I1RT>;
+defm BARRIER_CTA_RED_OR_ALIGNED : BARRIER_RED_COUNT<"bar.red.or", int_nvvm_barrier_cta_red_or_aligned_count, I1RT>;
+
+defm BARRIER_CTA_RED_POPC_ALL : BARRIER_RED_ALL<"barrier.red.popc", int_nvvm_barrier_cta_red_popc_all, I32RT, [hasPTX<60>]>;
+defm BARRIER_CTA_RED_AND_ALL : BARRIER_RED_ALL<"barrier.red.and", int_nvvm_barrier_cta_red_and_all, I1RT, [hasPTX<60>]>;
+defm BARRIER_CTA_RED_OR_ALL : BARRIER_RED_ALL<"barrier.red.or", int_nvvm_barrier_cta_red_or_all, I1RT, [hasPTX<60>]>;
+defm BARRIER_CTA_RED_POPC_COUNT : BARRIER_RED_COUNT<"barrier.red.popc", int_nvvm_barrier_cta_red_popc_count, I32RT, [hasPTX<60>]>;
+defm BARRIER_CTA_RED_AND_COUNT : BARRIER_RED_COUNT<"barrier.red.and", int_nvvm_barrier_cta_red_and_count, I1RT, [hasPTX<60>]>;
+defm BARRIER_CTA_RED_OR_COUNT : BARRIER_RED_COUNT<"barrier.red.or", int_nvvm_barrier_cta_red_or_count, I1RT, [hasPTX<60>]>;
 
 class INT_BARRIER_CLUSTER<string variant, Intrinsic Intr,
                           list<Predicate> Preds = [hasPTX<78>, hasSM<90>]>:

diff  --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 074d8edeee4fd..62546a03fb283 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -2149,9 +2149,12 @@ bool AANoSync::isAlignedBarrier(const CallBase &CB, bool ExecutedAligned) {
   switch (CB.getIntrinsicID()) {
   case Intrinsic::nvvm_barrier_cta_sync_aligned_all:
   case Intrinsic::nvvm_barrier_cta_sync_aligned_count:
-  case Intrinsic::nvvm_barrier0_and:
-  case Intrinsic::nvvm_barrier0_or:
-  case Intrinsic::nvvm_barrier0_popc:
+  case Intrinsic::nvvm_barrier_cta_red_and_aligned_all:
+  case Intrinsic::nvvm_barrier_cta_red_and_aligned_count:
+  case Intrinsic::nvvm_barrier_cta_red_or_aligned_all:
+  case Intrinsic::nvvm_barrier_cta_red_or_aligned_count:
+  case Intrinsic::nvvm_barrier_cta_red_popc_aligned_all:
+  case Intrinsic::nvvm_barrier_cta_red_popc_aligned_count:
     return true;
   case Intrinsic::amdgcn_s_barrier:
     if (ExecutedAligned)

diff  --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
index 4fc506f1f5edf..0f749cf81f39b 100644
--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -87,6 +87,10 @@ declare void @llvm.nvvm.barrier(i32, i32)
 declare void @llvm.nvvm.barrier.sync(i32)
 declare void @llvm.nvvm.barrier.sync.cnt(i32, i32)
 
+declare i32 @llvm.nvvm.barrier0.popc(i32)
+declare i32 @llvm.nvvm.barrier0.and(i32)
+declare i32 @llvm.nvvm.barrier0.or(i32)
+
 declare float @llvm.nvvm.ex2.approx.f(float)
 declare double @llvm.nvvm.ex2.approx.d(double)
 declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>)
@@ -345,19 +349,33 @@ define void @nvvm_cp_async_bulk_tensor_g2s_tile(ptr addrspace(3) %d, ptr addrspa
   ret void
 }
 
-define void @cta_barriers(i32 %x, i32 %y) {
+define void @cta_barriers(i32 %x, i32 %y, i32 %z) {
 ; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
 ; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %x)
 ; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %x)
 ; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 %x, i32 %y)
 ; CHECK: call void @llvm.nvvm.barrier.cta.sync.all(i32 %x)
 ; CHECK: call void @llvm.nvvm.barrier.cta.sync.count(i32 %x, i32 %y)
+
+; CHECK: %1 = icmp ne i32 %z, 0
+; CHECK: %2 = call i32 @llvm.nvvm.barrier.cta.red.popc.aligned.all(i32 0, i1 %1)
+; CHECK: %3 = icmp ne i32 %z, 0
+; CHECK: %4 = call i1 @llvm.nvvm.barrier.cta.red.and.aligned.all(i32 0, i1 %3)
+; CHECK: %5 = zext i1 %4 to i32
+; CHECK: %6 = icmp ne i32 %z, 0
+; CHECK: %7 = call i1 @llvm.nvvm.barrier.cta.red.or.aligned.all(i32 0, i1 %6)
+; CHECK: %8 = zext i1 %7 to i32
+
   call void @llvm.nvvm.barrier0()
   call void @llvm.nvvm.barrier.n(i32 %x)
   call void @llvm.nvvm.bar.sync(i32 %x)
   call void @llvm.nvvm.barrier(i32 %x, i32 %y)
   call void @llvm.nvvm.barrier.sync(i32 %x)
   call void @llvm.nvvm.barrier.sync.cnt(i32 %x, i32 %y)
+
+  %r1 = call i32 @llvm.nvvm.barrier0.popc(i32 %z)
+  %r2 = call i32 @llvm.nvvm.barrier0.and(i32 %z)
+  %r3 = call i32 @llvm.nvvm.barrier0.or(i32 %z)
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/NVPTX/barrier.ll b/llvm/test/CodeGen/NVPTX/barrier.ll
index f2d6f2354038f..c785f09fcf87e 100644
--- a/llvm/test/CodeGen/NVPTX/barrier.ll
+++ b/llvm/test/CodeGen/NVPTX/barrier.ll
@@ -134,3 +134,177 @@ define void @barrier_cta_arrive(i32 %id, i32 %cnt) {
   call void @llvm.nvvm.barrier.cta.arrive.count(i32 4, i32 64)
   ret void
 }
+
+define void @barrier_cta_red_popc_all(i32 %id, i1 %pred) {
+; CHECK-LABEL: barrier_cta_red_popc_all(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [barrier_cta_red_popc_all_param_1];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [barrier_cta_red_popc_all_param_0];
+; CHECK-NEXT:    bar.red.popc %r2, %r1, %p1;
+; CHECK-NEXT:    bar.red.popc %r3, 3, %p1;
+; CHECK-NEXT:    barrier.red.popc %r4, %r1, %p1;
+; CHECK-NEXT:    barrier.red.popc %r5, 3, %p1;
+; CHECK-NEXT:    ret;
+  %v1 = call i32 @llvm.nvvm.barrier.cta.red.popc.aligned.all(i32 %id, i1 %pred)
+  %v2 = call i32 @llvm.nvvm.barrier.cta.red.popc.aligned.all(i32 3, i1 %pred)
+  %v3 = call i32 @llvm.nvvm.barrier.cta.red.popc.all(i32 %id, i1 %pred)
+  %v4 = call i32 @llvm.nvvm.barrier.cta.red.popc.all(i32 3, i1 %pred)
+  ret void
+}
+
+define void @barrier_cta_red_popc_count(i32 %id, i32 %cnt, i1 %pred) {
+; CHECK-LABEL: barrier_cta_red_popc_count(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<11>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [barrier_cta_red_popc_count_param_2];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [barrier_cta_red_popc_count_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [barrier_cta_red_popc_count_param_1];
+; CHECK-NEXT:    bar.red.popc %r3, %r1, %r2, %p1;
+; CHECK-NEXT:    bar.red.popc %r4, 3, %r2, %p1;
+; CHECK-NEXT:    barrier.red.popc %r5, %r1, %r2, %p1;
+; CHECK-NEXT:    barrier.red.popc %r6, 3, %r2, %p1;
+; CHECK-NEXT:    bar.red.popc %r7, %r1, 10, %p1;
+; CHECK-NEXT:    bar.red.popc %r8, 3, 11, %p1;
+; CHECK-NEXT:    barrier.red.popc %r9, %r1, 12, %p1;
+; CHECK-NEXT:    barrier.red.popc %r10, 3, 13, %p1;
+; CHECK-NEXT:    ret;
+  %v1 = call i32 @llvm.nvvm.barrier.cta.red.popc.aligned.count(i32 %id, i32 %cnt, i1 %pred)
+  %v2 = call i32 @llvm.nvvm.barrier.cta.red.popc.aligned.count(i32 3, i32 %cnt, i1 %pred)
+  %v3 = call i32 @llvm.nvvm.barrier.cta.red.popc.count(i32 %id, i32 %cnt, i1 %pred)
+  %v4 = call i32 @llvm.nvvm.barrier.cta.red.popc.count(i32 3, i32 %cnt, i1 %pred)
+
+  %v5 = call i32 @llvm.nvvm.barrier.cta.red.popc.aligned.count(i32 %id, i32 10, i1 %pred)
+  %v6 = call i32 @llvm.nvvm.barrier.cta.red.popc.aligned.count(i32 3, i32 11, i1 %pred)
+  %v7 = call i32 @llvm.nvvm.barrier.cta.red.popc.count(i32 %id, i32 12, i1 %pred)
+  %v8 = call i32 @llvm.nvvm.barrier.cta.red.popc.count(i32 3, i32 13, i1 %pred)
+  ret void
+}
+
+define void @barrier_cta_red_and_all(i32 %id, i1 %pred) {
+; CHECK-LABEL: barrier_cta_red_and_all(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<6>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [barrier_cta_red_and_all_param_1];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [barrier_cta_red_and_all_param_0];
+; CHECK-NEXT:    bar.red.and %p2, %r1, %p1;
+; CHECK-NEXT:    bar.red.and %p3, 3, %p1;
+; CHECK-NEXT:    barrier.red.and %p4, %r1, %p1;
+; CHECK-NEXT:    barrier.red.and %p5, 3, %p1;
+; CHECK-NEXT:    ret;
+  %v1 = call i1 @llvm.nvvm.barrier.cta.red.and.aligned.all(i32 %id, i1 %pred)
+  %v2 = call i1 @llvm.nvvm.barrier.cta.red.and.aligned.all(i32 3, i1 %pred)
+  %v3 = call i1 @llvm.nvvm.barrier.cta.red.and.all(i32 %id, i1 %pred)
+  %v4 = call i1 @llvm.nvvm.barrier.cta.red.and.all(i32 3, i1 %pred)
+  ret void
+}
+
+define void @barrier_cta_red_and_count(i32 %id, i32 %cnt, i1 %pred) {
+; CHECK-LABEL: barrier_cta_red_and_count(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<10>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [barrier_cta_red_and_count_param_2];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [barrier_cta_red_and_count_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [barrier_cta_red_and_count_param_1];
+; CHECK-NEXT:    bar.red.and %p2, %r1, %r2, %p1;
+; CHECK-NEXT:    bar.red.and %p3, 3, %r2, %p1;
+; CHECK-NEXT:    barrier.red.and %p4, %r1, %r2, %p1;
+; CHECK-NEXT:    barrier.red.and %p5, 3, %r2, %p1;
+; CHECK-NEXT:    bar.red.and %p6, %r1, 10, %p1;
+; CHECK-NEXT:    bar.red.and %p7, 3, 11, %p1;
+; CHECK-NEXT:    barrier.red.and %p8, %r1, 12, %p1;
+; CHECK-NEXT:    barrier.red.and %p9, 3, 13, %p1;
+; CHECK-NEXT:    ret;
+  %v1 = call i1 @llvm.nvvm.barrier.cta.red.and.aligned.count(i32 %id, i32 %cnt, i1 %pred)
+  %v2 = call i1 @llvm.nvvm.barrier.cta.red.and.aligned.count(i32 3, i32 %cnt, i1 %pred)
+  %v3 = call i1 @llvm.nvvm.barrier.cta.red.and.count(i32 %id, i32 %cnt, i1 %pred)
+  %v4 = call i1 @llvm.nvvm.barrier.cta.red.and.count(i32 3, i32 %cnt, i1 %pred)
+
+  %v5 = call i1 @llvm.nvvm.barrier.cta.red.and.aligned.count(i32 %id, i32 10, i1 %pred)
+  %v6 = call i1 @llvm.nvvm.barrier.cta.red.and.aligned.count(i32 3, i32 11, i1 %pred)
+  %v7 = call i1 @llvm.nvvm.barrier.cta.red.and.count(i32 %id, i32 12, i1 %pred)
+  %v8 = call i1 @llvm.nvvm.barrier.cta.red.and.count(i32 3, i32 13, i1 %pred)
+  ret void
+}
+
+define void @barrier_cta_red_or_all(i32 %id, i1 %pred) {
+; CHECK-LABEL: barrier_cta_red_or_all(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<6>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [barrier_cta_red_or_all_param_1];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [barrier_cta_red_or_all_param_0];
+; CHECK-NEXT:    bar.red.or %p2, %r1, %p1;
+; CHECK-NEXT:    bar.red.or %p3, 3, %p1;
+; CHECK-NEXT:    barrier.red.or %p4, %r1, %p1;
+; CHECK-NEXT:    barrier.red.or %p5, 3, %p1;
+; CHECK-NEXT:    ret;
+  %v1 = call i1 @llvm.nvvm.barrier.cta.red.or.aligned.all(i32 %id, i1 %pred)
+  %v2 = call i1 @llvm.nvvm.barrier.cta.red.or.aligned.all(i32 3, i1 %pred)
+  %v3 = call i1 @llvm.nvvm.barrier.cta.red.or.all(i32 %id, i1 %pred)
+  %v4 = call i1 @llvm.nvvm.barrier.cta.red.or.all(i32 3, i1 %pred)
+  ret void
+}
+
+define void @barrier_cta_red_or_count(i32 %id, i32 %cnt, i1 %pred) {
+; CHECK-LABEL: barrier_cta_red_or_count(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<10>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [barrier_cta_red_or_count_param_2];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [barrier_cta_red_or_count_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [barrier_cta_red_or_count_param_1];
+; CHECK-NEXT:    bar.red.or %p2, %r1, %r2, %p1;
+; CHECK-NEXT:    bar.red.or %p3, 3, %r2, %p1;
+; CHECK-NEXT:    barrier.red.or %p4, %r1, %r2, %p1;
+; CHECK-NEXT:    barrier.red.or %p5, 3, %r2, %p1;
+; CHECK-NEXT:    bar.red.or %p6, %r1, 10, %p1;
+; CHECK-NEXT:    bar.red.or %p7, 3, 11, %p1;
+; CHECK-NEXT:    barrier.red.or %p8, %r1, 12, %p1;
+; CHECK-NEXT:    barrier.red.or %p9, 3, 13, %p1;
+; CHECK-NEXT:    ret;
+  %v1 = call i1 @llvm.nvvm.barrier.cta.red.or.aligned.count(i32 %id, i32 %cnt, i1 %pred)
+  %v2 = call i1 @llvm.nvvm.barrier.cta.red.or.aligned.count(i32 3, i32 %cnt, i1 %pred)
+  %v3 = call i1 @llvm.nvvm.barrier.cta.red.or.count(i32 %id, i32 %cnt, i1 %pred)
+  %v4 = call i1 @llvm.nvvm.barrier.cta.red.or.count(i32 3, i32 %cnt, i1 %pred)
+
+  %v5 = call i1 @llvm.nvvm.barrier.cta.red.or.aligned.count(i32 %id, i32 10, i1 %pred)
+  %v6 = call i1 @llvm.nvvm.barrier.cta.red.or.aligned.count(i32 3, i32 11, i1 %pred)
+  %v7 = call i1 @llvm.nvvm.barrier.cta.red.or.count(i32 %id, i32 12, i1 %pred)
+  %v8 = call i1 @llvm.nvvm.barrier.cta.red.or.count(i32 3, i32 13, i1 %pred)
+  ret void
+}

diff  --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 4105a0aec128b..ed9dad4389453 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -1272,7 +1272,8 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
     auto [id, args] = NVVM::BarrierOp::getIntrinsicIDAndArgs(
                         *op, moduleTranslation, builder);
     if ($reductionOp)
-      $res = createIntrinsicCall(builder, id, args);
+      $res = builder.CreateZExt(createIntrinsicCall(builder, id, args),
+                                builder.getInt32Ty());
     else 
       createIntrinsicCall(builder, id, args);
   }];

diff  --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 6e22bdc3ab135..331d7a244310f 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -3093,27 +3093,26 @@ mlir::NVVM::IDArgPair NVVM::BarrierOp::getIntrinsicIDAndArgs(
                                ? mt.lookupValue(thisOp.getBarrierId())
                                : builder.getInt32(0);
   llvm::Intrinsic::ID id;
-  llvm::SmallVector<llvm::Value *> args;
+  llvm::SmallVector<llvm::Value *> args = {barrierId};
   if (thisOp.getNumberOfThreads()) {
     id = llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_count;
-    args.push_back(barrierId);
     args.push_back(mt.lookupValue(thisOp.getNumberOfThreads()));
   } else if (thisOp.getReductionOp()) {
     switch (*thisOp.getReductionOp()) {
     case NVVM::BarrierReduction::AND:
-      id = llvm::Intrinsic::nvvm_barrier0_and;
+      id = llvm::Intrinsic::nvvm_barrier_cta_red_and_aligned_all;
       break;
     case NVVM::BarrierReduction::OR:
-      id = llvm::Intrinsic::nvvm_barrier0_or;
+      id = llvm::Intrinsic::nvvm_barrier_cta_red_or_aligned_all;
       break;
     case NVVM::BarrierReduction::POPC:
-      id = llvm::Intrinsic::nvvm_barrier0_popc;
+      id = llvm::Intrinsic::nvvm_barrier_cta_red_popc_aligned_all;
       break;
     }
-    args.push_back(mt.lookupValue(thisOp.getReductionPredicate()));
+    args.push_back(builder.CreateICmpNE(
+        mt.lookupValue(thisOp.getReductionPredicate()), builder.getInt32(0)));
   } else {
     id = llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_all;
-    args.push_back(barrierId);
   }
 
   return {id, std::move(args)};

diff  --git a/mlir/test/Target/LLVMIR/nvvm/barrier.mlir b/mlir/test/Target/LLVMIR/nvvm/barrier.mlir
index a18633ef208c6..7e654eb8dc572 100644
--- a/mlir/test/Target/LLVMIR/nvvm/barrier.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/barrier.mlir
@@ -13,13 +13,16 @@ llvm.func @llvm_nvvm_barrier(%barID : i32, %numberOfThreads : i32, %redOperand :
   // LLVM: call void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 %[[barId]], i32 %[[numThreads]])
   // CHECK: nvvm.barrier id = %{{.*}} number_of_threads = %{{.*}}
   nvvm.barrier id = %barID number_of_threads = %numberOfThreads
-  // LLVM: %{{.*}} = call i32 @llvm.nvvm.barrier0.and(i32 %[[redOperand]])
+  // LLVM: %[[redOperandCmp1:.*]] = icmp ne i32 %[[redOperand]], 0
+  // LLVM: %{{.*}} = call i1 @llvm.nvvm.barrier.cta.red.and.aligned.all(i32 0, i1 %[[redOperandCmp1]])
   // CHECK: %{{.*}} = nvvm.barrier #nvvm.reduction<and> %{{.*}} -> i32
   %0 = nvvm.barrier #nvvm.reduction<and> %redOperand -> i32
-  // LLVM: %{{.*}} = call i32 @llvm.nvvm.barrier0.or(i32 %[[redOperand]])
+  // LLVM: %[[redOperandCmp2:.*]] = icmp ne i32 %[[redOperand]], 0
+  // LLVM: %{{.*}} = call i1 @llvm.nvvm.barrier.cta.red.or.aligned.all(i32 0, i1 %[[redOperandCmp2]])
   // CHECK: %{{.*}} = nvvm.barrier #nvvm.reduction<or> %{{.*}} -> i32
   %1 = nvvm.barrier #nvvm.reduction<or> %redOperand -> i32
-  // LLVM: %{{.*}} = call i32 @llvm.nvvm.barrier0.popc(i32 %[[redOperand]])
+  // LLVM: %[[redOperandCmp3:.*]] = icmp ne i32 %[[redOperand]], 0
+  // LLVM: %{{.*}} = call i32 @llvm.nvvm.barrier.cta.red.popc.aligned.all(i32 0, i1 %[[redOperandCmp3]])
   // CHECK: %{{.*}} = nvvm.barrier #nvvm.reduction<popc> %{{.*}} -> i32
   %2 = nvvm.barrier #nvvm.reduction<popc> %redOperand -> i32
 


        


More information about the Mlir-commits mailing list