[Mlir-commits] [mlir] [mlir][gpu] Add gpu.ballot operation to GPU dialect (PR #188647)

Thu Mar 26 21:35:54 PDT 2026

https://github.com/bangtianliu updated https://github.com/llvm/llvm-project/pull/188647

>From 819a60f54a2cbc5ea7acbf4e4e2c90d0461b51d0 Mon Sep 17 00:00:00 2001
From: Bangtian Liu <liubangtian at gmail.com>
Date: Wed, 25 Mar 2026 17:49:09 -0700
Subject: [PATCH 1/2] [mlir][gpu] Add gpu.ballot operation to GPU dialect

Signed-off-by: Bangtian Liu <liubangtian at gmail.com>
---
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    | 21 ++++++++++
 .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp      | 23 ++++++++++-
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp        |  6 +++
 .../gpu-to-rocdl-invalid-ballot.mlir          | 41 +++++++++++++++++++
 .../Conversion/GPUToROCDL/gpu-to-rocdl.mlir   | 10 +++++
 .../GPU/broadcast-speculatability.mlir        | 18 ++++++++
 mlir/test/Dialect/GPU/ops.mlir                | 10 +++++
 7 files changed, 128 insertions(+), 1 deletion(-)
 create mode 100644 mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-invalid-ballot.mlir

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 36e0875f53b0a..f0a4dd44c8f67 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -3288,4 +3288,25 @@ def GPU_SubgroupBroadcastOp : GPU_Op<"subgroup_broadcast",
   let hasVerifier = 1;
 }
 
+def GPU_BallotOp : GPU_Op<"ballot", [NoMemoryEffect]>,
+    Arguments<(ins I1:$predicate)>,
+    Results<(outs AnySignlessInteger:$result)> {
+  let summary = "Collects predicate values from all threads in a subgroup.";
+  let description = [{
+    The `ballot` op performs a ballot operation across all threads in a subgroup.
+    Each thread contributes its predicate value as a single bit. The result is an
+    integer where the Nth bit is set iff the Nth thread's predicate is true.
+
+    The result type must be a signless integer type. The exact supported widths
+    are target-dependent. Common GPU targets support i32 and i64.
+
+    Example:
+    ```mlir
+    %0 = gpu.ballot %pred : i32
+    %1 = gpu.ballot %pred : i64
+    ```
+  }];
+  let assemblyFormat = "$predicate attr-dict `:` type($result)";
+}
+
 #endif // GPU_OPS
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index e08ec138c853a..1b74fa81f66fe 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -417,6 +417,27 @@ struct GPUSubgroupBroadcastOpToROCDL
   }
 };
 
+struct GPUBallotOpToROCDL : public ConvertOpToLLVMPattern<gpu::BallotOp> {
+  using ConvertOpToLLVMPattern<gpu::BallotOp>::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(gpu::BallotOp op, gpu::BallotOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto intType = cast<IntegerType>(op.getType());
+    unsigned width = intType.getWidth();
+
+    // ROCDL ballot natively supports i32 and i64 for wavefront sizes of
+    // 32 and 64 lanes.
+    if (width != 32 && width != 64)
+      return rewriter.notifyMatchFailure(
+          op, "rocdl.ballot only supports i32 and i64 result types");
+
+    rewriter.replaceOpWithNewOp<ROCDL::BallotOp>(op, op.getType(),
+                                                 adaptor.getPredicate());
+    return success();
+  }
+};
+
 struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
   using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;
 
@@ -764,7 +785,7 @@ void mlir::populateGpuToROCDLConversionPatterns(
   patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
 
   patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
-               GPUSubgroupBroadcastOpToROCDL>(converter);
+               GPUSubgroupBroadcastOpToROCDL, GPUBallotOpToROCDL>(converter);
   patterns.add<GPUSubgroupIdOpToROCDL, GPUSubgroupSizeOpToROCDL,
                GPUBarrierOpLowering>(converter, chipset);
 
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 5d409f71847c6..aff6bce57b5a0 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -2619,6 +2619,12 @@ OpFoldResult gpu::SubgroupBroadcastOp::fold(FoldAdaptor /*adaptor*/) {
   return nullptr;
 }
 
+//===----------------------------------------------------------------------===//
+// GPU_BallotOp
+//===----------------------------------------------------------------------===//
+
+// No custom implementations needed; ballot uses default behavior from ODS.
+
 //===----------------------------------------------------------------------===//
 // GPU KernelMetadataAttr
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-invalid-ballot.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-invalid-ballot.mlir
new file mode 100644
index 0000000000000..a94ab3b5bb780
--- /dev/null
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-invalid-ballot.mlir
@@ -0,0 +1,41 @@
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx950' -split-input-file -verify-diagnostics
+
+// -----
+
+gpu.module @test_module {
+  func.func @ballot_i8(%pred: i1) -> i8 {
+    // expected-error @+1 {{failed to legalize operation 'gpu.ballot' that was explicitly marked illegal}}
+    %0 = gpu.ballot %pred : i8
+    func.return %0 : i8
+  }
+}
+
+// -----
+
+gpu.module @test_module {
+  func.func @ballot_i16(%pred: i1) -> i16 {
+    // expected-error @+1 {{failed to legalize operation 'gpu.ballot' that was explicitly marked illegal}}
+    %0 = gpu.ballot %pred : i16
+    func.return %0 : i16
+  }
+}
+
+// -----
+
+gpu.module @test_module {
+  func.func @ballot_i48(%pred: i1) -> i48 {
+    // expected-error @+1 {{failed to legalize operation 'gpu.ballot' that was explicitly marked illegal}}
+    %0 = gpu.ballot %pred : i48
+    func.return %0 : i48
+  }
+}
+
+// -----
+
+gpu.module @test_module {
+  func.func @ballot_i128(%pred: i1) -> i128 {
+    // expected-error @+1 {{failed to legalize operation 'gpu.ballot' that was explicitly marked illegal}}
+    %0 = gpu.ballot %pred : i128
+    func.return %0 : i128
+  }
+}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 5eaa2d0b4df28..68a5328b8eb77 100755
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -989,4 +989,14 @@ func.func @broadcast_3xi16(%arg0 : vector<3xi16>) -> vector<3xi16> {
   %0 = gpu.subgroup_broadcast %arg0, first_active_lane : vector<3xi16>
   func.return %0 : vector<3xi16>
 }
+
+// CHECK-LABEL: func @ballot
+//  CHECK-SAME: (%[[PRED:.*]]: i1)
+func.func @ballot(%pred: i1) -> (i32, i64) {
+  // CHECK: rocdl.ballot %[[PRED]] : i32
+  %0 = gpu.ballot %pred : i32
+  // CHECK: rocdl.ballot %[[PRED]] : i64
+  %1 = gpu.ballot %pred : i64
+  func.return %0, %1 : i32, i64
+}
 }
diff --git a/mlir/test/Dialect/GPU/broadcast-speculatability.mlir b/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
index 3cf4853effee5..7062ce4e6f46f 100644
--- a/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
+++ b/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
@@ -1,6 +1,7 @@
 // RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s
 
 func.func private @side_effect(%arg0 : f32, %arg1 : f32)
+func.func private @use_i32(%arg0 : i32)
 
 // CHECK-LABEL: func @broadcast_hoisting
 //  CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32, {{.*}}: index)
@@ -20,3 +21,20 @@ func.func @broadcast_hoisting(%arg0 : f32, %arg1 : i32, %arg2 : index) {
   }
   func.return
 }
+
+// CHECK-LABEL: func @ballot_no_hoisting
+//  CHECK-SAME: (%[[PRED:.*]]: i1, {{.*}}: index)
+func.func @ballot_no_hoisting(%pred: i1, %n: index) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  // Ballot cannot be speculated across control flow because
+  // it depends on active lanes, which can change.
+  // CHECK: scf.for
+  // CHECK: %[[BALLOT:.*]] = gpu.ballot %[[PRED]] : i32
+  // CHECK: func.call @use_i32(%[[BALLOT]])
+  scf.for %i = %c0 to %n step %c1 {
+    %0 = gpu.ballot %pred : i32
+    func.call @use_i32(%0) : (i32) -> ()
+  }
+  func.return
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 1d05268ed4475..a5dad3f931cc1 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -586,3 +586,13 @@ func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32) {
   %1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
   func.return %0, %1 : f32, f32
 }
+
+// CHECK-LABEL: func @ballot
+//  CHECK-SAME: (%[[PRED:.*]]: i1)
+func.func @ballot(%pred: i1) -> (i32, i64) {
+  // CHECK: gpu.ballot %[[PRED]] : i32
+  %0 = gpu.ballot %pred : i32
+  // CHECK: gpu.ballot %[[PRED]] : i64
+  %1 = gpu.ballot %pred : i64
+  func.return %0, %1 : i32, i64
+}

>From 110ee0216d0aa40a7db21d883b42627d010e20c2 Mon Sep 17 00:00:00 2001
From: Bangtian Liu <liubangtian at gmail.com>
Date: Thu, 26 Mar 2026 21:34:58 -0700
Subject: [PATCH 2/2] add nvvm and spir-v lowering

Signed-off-by: Bangtian Liu <liubangtian at gmail.com>
---
 .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp        |  39 +++++-
 mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp |  60 ++++++++-
 .../GPUToNVVM/gpu-to-nvvm-invalid-ballot.mlir |  31 +++++
 .../Conversion/GPUToNVVM/gpu-to-nvvm.mlir     |  18 +++
 mlir/test/Conversion/GPUToSPIRV/ballot.mlir   | 122 ++++++++++++++++++
 5 files changed, 266 insertions(+), 4 deletions(-)
 create mode 100644 mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-invalid-ballot.mlir
 create mode 100644 mlir/test/Conversion/GPUToSPIRV/ballot.mlir

diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 660b24b071b49..021bc4ab8b548 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -236,6 +236,41 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
   }
 };
 
+struct GPUBallotOpToNVVM : public ConvertOpToLLVMPattern<gpu::BallotOp> {
+  using ConvertOpToLLVMPattern<gpu::BallotOp>::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(gpu::BallotOp op, gpu::BallotOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op->getLoc();
+    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
+    auto intType = cast<IntegerType>(op.getType());
+    unsigned width = intType.getWidth();
+
+    // NVVM ballot natively returns i32. For i64 results, zero-extend since
+    // NVIDIA warps have exactly 32 threads, so upper 32 bits are always zero.
+    if (width != 32 && width != 64)
+      return rewriter.notifyMatchFailure(
+          op, "nvvm.vote.sync ballot only supports i32 and i64 result types");
+
+    // Use full mask (-1) so all 32 lanes participate in the ballot.
+    Value mask = LLVM::ConstantOp::create(rewriter, loc, int32Type,
+                                          rewriter.getI32IntegerAttr(-1));
+
+    auto voteKind = NVVM::VoteSyncKindAttr::get(rewriter.getContext(),
+                                                NVVM::VoteSyncKind::ballot);
+    Value result = NVVM::VoteSyncOp::create(rewriter, loc, int32Type, mask,
+                                            adaptor.getPredicate(), voteKind);
+
+    if (width == 64) {
+      result = LLVM::ZExtOp::create(rewriter, loc, op.getType(), result);
+    }
+
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+};
+
 /// Lowering of cf.assert into a conditional __assertfail.
 struct AssertOpToAssertfailLowering
     : public ConvertOpToLLVMPattern<cf::AssertOp> {
@@ -504,8 +539,8 @@ void mlir::populateGpuToNVVMConversionPatterns(
   patterns.add<gpu::index_lowering::OpLowering<
       gpu::GridDimOp, NVVM::GridDimXOp, NVVM::GridDimYOp, NVVM::GridDimZOp>>(
       converter, IndexKind::Grid, IntrType::Dim, benefit);
-  patterns.add<GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(
-      converter, benefit);
+  patterns.add<GPULaneIdOpToNVVM, GPUBallotOpToNVVM, GPUShuffleOpLowering,
+               GPUReturnOpLowering>(converter, benefit);
 
   patterns.add<GPUDynamicSharedMemoryOpLowering>(
       converter, NVVM::kSharedMemoryAlignmentBit, benefit);
diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
index c33a903d03393..b4472291a2d77 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
@@ -130,6 +130,15 @@ class GPURotateConversion final : public OpConversionPattern<gpu::RotateOp> {
                   ConversionPatternRewriter &rewriter) const override;
 };
 
+class GPUBallotConversion final : public OpConversionPattern<gpu::BallotOp> {
+public:
+  using Base::Base;
+
+  LogicalResult
+  matchAndRewrite(gpu::BallotOp ballotOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 class GPUPrintfConversion final : public OpConversionPattern<gpu::PrintfOp> {
 public:
   using Base::Base;
@@ -542,6 +551,52 @@ LogicalResult GPURotateConversion::matchAndRewrite(
   return success();
 }
 
+LogicalResult GPUBallotConversion::matchAndRewrite(
+    gpu::BallotOp ballotOp, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  Location loc = ballotOp.getLoc();
+  auto scope = rewriter.getAttr<spirv::ScopeAttr>(spirv::Scope::Subgroup);
+  auto int32Type = rewriter.getI32Type();
+  auto vec4i32Type = VectorType::get({4}, int32Type);
+
+  // SPIR-V ballot returns vector<4xi32> to support subgroups up to 128 lanes.
+  Value ballot = spirv::GroupNonUniformBallotOp::create(rewriter, loc,
+                                                        vec4i32Type, scope,
+                                                        adaptor.getPredicate());
+
+  auto intType = cast<IntegerType>(ballotOp.getType());
+  unsigned width = intType.getWidth();
+
+  if (width == 32) {
+    Value result =
+        spirv::CompositeExtractOp::create(rewriter, loc, ballot, {0});
+    rewriter.replaceOp(ballotOp, result);
+  } else if (width == 64) {
+    // Combine first two vector elements: low 32 bits + (high 32 bits << 32).
+    Value low = spirv::CompositeExtractOp::create(rewriter, loc, ballot, {0});
+    Value high = spirv::CompositeExtractOp::create(rewriter, loc, ballot, {1});
+
+    auto int64Type = rewriter.getI64Type();
+    Value lowExt = spirv::UConvertOp::create(rewriter, loc, int64Type, low);
+    Value highExt = spirv::UConvertOp::create(rewriter, loc, int64Type, high);
+
+    Value shift32 = spirv::ConstantOp::create(
+        rewriter, loc, int64Type,
+        rewriter.getIntegerAttr(int64Type, 32));
+    Value highShifted =
+        spirv::ShiftLeftLogicalOp::create(rewriter, loc, highExt, shift32);
+
+    Value result =
+        spirv::BitwiseOrOp::create(rewriter, loc, lowExt, highShifted);
+    rewriter.replaceOp(ballotOp, result);
+  } else {
+    return rewriter.notifyMatchFailure(
+        ballotOp, "only i32 and i64 result types are supported for SPIR-V");
+  }
+
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // Group ops
 //===----------------------------------------------------------------------===//
@@ -830,8 +885,9 @@ LogicalResult GPUPrintfConversion::matchAndRewrite(
 void mlir::populateGPUToSPIRVPatterns(const SPIRVTypeConverter &typeConverter,
                                       RewritePatternSet &patterns) {
   patterns.add<
-      GPUBarrierConversion, GPUFuncOpConversion, GPUModuleConversion,
-      GPUReturnOpConversion, GPUShuffleConversion, GPURotateConversion,
+      GPUBarrierConversion, GPUBallotConversion, GPUFuncOpConversion,
+      GPUModuleConversion, GPUReturnOpConversion, GPUShuffleConversion,
+      GPURotateConversion,
       LaunchConfigConversion<gpu::BlockIdOp, spirv::BuiltIn::WorkgroupId>,
       LaunchConfigConversion<gpu::GridDimOp, spirv::BuiltIn::NumWorkgroups>,
       LaunchConfigConversion<gpu::BlockDimOp, spirv::BuiltIn::WorkgroupSize>,
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-invalid-ballot.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-invalid-ballot.mlir
new file mode 100644
index 0000000000000..d648fdca3b7b1
--- /dev/null
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-invalid-ballot.mlir
@@ -0,0 +1,31 @@
+// RUN: mlir-opt %s -convert-gpu-to-nvvm -split-input-file -verify-diagnostics
+
+// -----
+
+gpu.module @test_module {
+  func.func @ballot_i8(%pred: i1) -> i8 {
+    // expected-error @+1 {{failed to legalize operation 'gpu.ballot' that was explicitly marked illegal}}
+    %0 = gpu.ballot %pred : i8
+    func.return %0 : i8
+  }
+}
+
+// -----
+
+gpu.module @test_module {
+  func.func @ballot_i16(%pred: i1) -> i16 {
+    // expected-error @+1 {{failed to legalize operation 'gpu.ballot' that was explicitly marked illegal}}
+    %0 = gpu.ballot %pred : i16
+    func.return %0 : i16
+  }
+}
+
+// -----
+
+gpu.module @test_module {
+  func.func @ballot_i128(%pred: i1) -> i128 {
+    // expected-error @+1 {{failed to legalize operation 'gpu.ballot' that was explicitly marked illegal}}
+    %0 = gpu.ballot %pred : i128
+    func.return %0 : i128
+  }
+}
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 929794f035b9f..e717c515bd1d6 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -1197,3 +1197,21 @@ gpu.module @test_module_cluster_block_ops {
   }
 }
 
+// -----
+
+module attributes {gpu.container_module} {
+  gpu.module @kernels {
+    // CHECK-LABEL: llvm.func @gpu_ballot
+    gpu.func @gpu_ballot(%arg0: i1) -> (i32, i64) {
+      // CHECK: %[[BALLOT_MASK1:.*]] = llvm.mlir.constant(-1 : i32) : i32
+      // CHECK: %[[BALLOT_I32:.*]] = nvvm.vote.sync ballot %[[BALLOT_MASK1]], %{{.*}} -> i32
+      %0 = gpu.ballot %arg0 : i32
+      // CHECK: %[[BALLOT_MASK2:.*]] = llvm.mlir.constant(-1 : i32) : i32
+      // CHECK: %[[BALLOT_I64_TMP:.*]] = nvvm.vote.sync ballot %[[BALLOT_MASK2]], %{{.*}} -> i32
+      // CHECK: %[[BALLOT_I64:.*]] = llvm.zext %[[BALLOT_I64_TMP]] : i32 to i64
+      %1 = gpu.ballot %arg0 : i64
+      gpu.return %0, %1 : i32, i64
+    }
+  }
+}
+
diff --git a/mlir/test/Conversion/GPUToSPIRV/ballot.mlir b/mlir/test/Conversion/GPUToSPIRV/ballot.mlir
new file mode 100644
index 0000000000000..4e8d60e432371
--- /dev/null
+++ b/mlir/test/Conversion/GPUToSPIRV/ballot.mlir
@@ -0,0 +1,122 @@
+// RUN: mlir-opt -split-input-file -convert-gpu-to-spirv -verify-diagnostics %s -o - | FileCheck %s
+
+module attributes {
+  gpu.container_module,
+  spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniformBallot], []>, #spirv.resource_limits<>>
+} {
+
+gpu.module @kernels {
+  // CHECK-LABEL: spirv.func @ballot_i32
+  gpu.func @ballot_i32() kernel
+    attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>} {
+    %c1 = arith.constant 1 : index
+    %lane_id = gpu.lane_id
+    %pred = arith.cmpi ult, %lane_id, %c1 : index
+
+    // CHECK: %[[VEC:.*]] = spirv.GroupNonUniformBallot <Subgroup> %{{.*}} : vector<4xi32>
+    // CHECK: %{{.*}} = spirv.CompositeExtract %[[VEC]][0 : i32] : vector<4xi32>
+    %result = gpu.ballot %pred : i32
+    gpu.return
+  }
+}
+
+}
+
+// -----
+
+module attributes {
+  gpu.container_module,
+  spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniformBallot, Int64], []>, #spirv.resource_limits<>>
+} {
+
+gpu.module @kernels {
+  // CHECK-LABEL: spirv.func @ballot_i64
+  gpu.func @ballot_i64() kernel
+    attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>} {
+    %c1 = arith.constant 1 : index
+    %lane_id = gpu.lane_id
+    %pred = arith.cmpi ult, %lane_id, %c1 : index
+
+    // CHECK: %[[VEC:.*]] = spirv.GroupNonUniformBallot <Subgroup> %{{.*}} : vector<4xi32>
+    // CHECK: %[[LOW:.*]] = spirv.CompositeExtract %[[VEC]][0 : i32] : vector<4xi32>
+    // CHECK: %[[HIGH:.*]] = spirv.CompositeExtract %[[VEC]][1 : i32] : vector<4xi32>
+    // CHECK: %[[LOW_EXT:.*]] = spirv.UConvert %[[LOW]] : i32 to i64
+    // CHECK: %[[HIGH_EXT:.*]] = spirv.UConvert %[[HIGH]] : i32 to i64
+    // CHECK: %[[C32:.*]] = spirv.Constant 32 : i64
+    // CHECK: %[[HIGH_SHIFTED:.*]] = spirv.ShiftLeftLogical %[[HIGH_EXT]], %[[C32]] : i64, i64
+    // CHECK: %{{.*}} = spirv.BitwiseOr %[[LOW_EXT]], %[[HIGH_SHIFTED]] : i64
+    %result = gpu.ballot %pred : i64
+    gpu.return
+  }
+}
+
+}
+
+// -----
+
+module attributes {
+  gpu.container_module,
+  spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniformBallot], []>, #spirv.resource_limits<>>
+} {
+
+gpu.module @kernels {
+  gpu.func @ballot_invalid_i8() kernel
+    attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>} {
+    %c1 = arith.constant 1 : index
+    %lane_id = gpu.lane_id
+    %pred = arith.cmpi ult, %lane_id, %c1 : index
+
+    // Cannot convert i8 ballot result type
+    // expected-error @+1 {{failed to legalize operation 'gpu.ballot'}}
+    %result = gpu.ballot %pred : i8
+    gpu.return
+  }
+}
+
+}
+
+// -----
+
+module attributes {
+  gpu.container_module,
+  spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniformBallot], []>, #spirv.resource_limits<>>
+} {
+
+gpu.module @kernels {
+  gpu.func @ballot_invalid_i16() kernel
+    attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>} {
+    %c1 = arith.constant 1 : index
+    %lane_id = gpu.lane_id
+    %pred = arith.cmpi ult, %lane_id, %c1 : index
+
+    // Cannot convert i16 ballot result type
+    // expected-error @+1 {{failed to legalize operation 'gpu.ballot'}}
+    %result = gpu.ballot %pred : i16
+    gpu.return
+  }
+}
+
+}
+
+// -----
+
+module attributes {
+  gpu.container_module,
+  spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniformBallot], []>, #spirv.resource_limits<>>
+} {
+
+gpu.module @kernels {
+  gpu.func @ballot_invalid_i128() kernel
+    attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>} {
+    %c1 = arith.constant 1 : index
+    %lane_id = gpu.lane_id
+    %pred = arith.cmpi ult, %lane_id, %c1 : index
+
+    // Cannot convert i128 ballot result type
+    // expected-error @+1 {{failed to legalize operation 'gpu.ballot'}}
+    %result = gpu.ballot %pred : i128
+    gpu.return
+  }
+}
+
+}