[clang] [llvm] [HLSL] Implement elementwise popcount (PR #108121)

Tue Sep 17 16:51:22 PDT 2024

https://github.com/spall updated https://github.com/llvm/llvm-project/pull/108121

>From ebb4078d485559f70d79d3b10dc9f4ce401a3261 Mon Sep 17 00:00:00 2001
From: Sarah Spall <spall at planetbauer.com>
Date: Fri, 6 Sep 2024 21:03:05 +0000
Subject: [PATCH 1/3] implement elementwise popcount to implement countbits

---
 clang/docs/LanguageExtensions.rst             |  1 +
 clang/include/clang/Basic/Builtins.td         |  6 ++
 clang/lib/CodeGen/CGBuiltin.cpp               |  3 +
 clang/lib/Headers/hlsl/hlsl_intrinsics.h      | 71 +++++++++++++++++++
 clang/lib/Sema/SemaChecking.cpp               |  2 +-
 clang/lib/Sema/SemaHLSL.cpp                   |  8 +++
 .../test/CodeGen/builtins-elementwise-math.c  | 37 ++++++++++
 clang/test/Sema/builtins-elementwise-math.c   | 21 ++++++
 .../SemaCXX/builtins-elementwise-math.cpp     |  8 +++
 llvm/lib/Target/DirectX/DXIL.td               | 11 +++
 llvm/test/CodeGen/DirectX/countbits.ll        | 31 ++++++++
 llvm/test/CodeGen/DirectX/countbits_error.ll  | 10 +++
 .../SPIRV/hlsl-intrinsics/countbits.ll        | 21 ++++++
 13 files changed, 229 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/DirectX/countbits.ll
 create mode 100644 llvm/test/CodeGen/DirectX/countbits_error.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/countbits.ll

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index c08697282cbfe8..f62f90fb9650a9 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -667,6 +667,7 @@ Unless specified otherwise operation(±0) = ±0 and operation(±infinity) = ±in
  T __builtin_elementwise_log(T x)            return the natural logarithm of x                                floating point types
  T __builtin_elementwise_log2(T x)           return the base 2 logarithm of x                                 floating point types
  T __builtin_elementwise_log10(T x)          return the base 10 logarithm of x                                floating point types
+ T __builtin_elementwise_popcount(T x)       return the number of 1 bits in x                                 integer types 
  T __builtin_elementwise_pow(T x, T y)       return x raised to the power of y                                floating point types
  T __builtin_elementwise_bitreverse(T x)     return the integer represented after reversing the bits of x     integer types
  T __builtin_elementwise_exp(T x)            returns the base-e exponential, e^x, of the specified value      floating point types
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 6cf03d27055cd9..8c5d7ad763bf97 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -1322,6 +1322,12 @@ def ElementwiseLog10 : Builtin {
   let Prototype = "void(...)";
 }
 
+def ElementwisePopcount : Builtin {
+  let Spellings = ["__builtin_elementwise_popcount"];
+  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Prototype = "void(...)";
+}
+
 def ElementwisePow : Builtin {
   let Spellings = ["__builtin_elementwise_pow"];
   let Attributes = [NoThrow, Const, CustomTypeChecking];
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a52e880a764252..df2b8b5595e8b3 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3834,6 +3834,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__builtin_elementwise_floor:
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, llvm::Intrinsic::floor, "elt.floor"));
+  case Builtin::BI__builtin_elementwise_popcount:
+    return RValue::get(emitBuiltinWithOneOverloadedType<1>(
+	*this, E, llvm::Intrinsic::ctpop, "elt.ctpop"));
   case Builtin::BI__builtin_elementwise_roundeven:
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, llvm::Intrinsic::roundeven, "elt.roundeven"));
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index 6a50d50ebd3479..6cd6a2caf19994 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -650,6 +650,77 @@ float3 cosh(float3);
 _HLSL_BUILTIN_ALIAS(__builtin_elementwise_cosh)
 float4 cosh(float4);
 
+//===----------------------------------------------------------------------===//
+// count bits builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn T countbits(T Val)
+/// \brief Return the number of bits (per component) set in the input integer.
+/// \param Val The input value.
+
+#ifdef __HLSL_ENABLE_16_BIT
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+int16_t countbits(int16_t);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+int16_t2 countbits(int16_t2);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+int16_t3 countbits(int16_t3);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+int16_t4 countbits(int16_t4);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+uint16_t countbits(uint16_t);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+uint16_t2 countbits(uint16_t2);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+uint16_t3 countbits(uint16_t3);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+uint16_t4 countbits(uint16_t4);
+#endif
+
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+int countbits(int);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+int2 countbits(int2);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+int3 countbits(int3);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+int4 countbits(int4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+uint countbits(uint);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+uint2 countbits(uint2);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+uint3 countbits(uint3);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+uint4 countbits(uint4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+int64_t countbits(int64_t);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+int64_t2 countbits(int64_t2);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+int64_t3 countbits(int64_t3);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+int64_t4 countbits(int64_t4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+uint64_t countbits(uint64_t);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+uint64_t2 countbits(uint64_t2);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+uint64_t3 countbits(uint64_t3);
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount)
+uint64_t4 countbits(uint64_t4);
+
 //===----------------------------------------------------------------------===//
 // dot product builtins
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 99500daca295c9..d2570119c3432d 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2795,7 +2795,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     if (BuiltinElementwiseMath(TheCall))
       return ExprError();
     break;
-
+  case Builtin::BI__builtin_elementwise_popcount:
   case Builtin::BI__builtin_elementwise_bitreverse: {
     if (PrepareBuiltinElementwiseMathOneArgCall(TheCall))
       return ExprError();
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index a303f211501348..1d60bc6b7512be 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -1601,6 +1601,14 @@ bool CheckUnsignedIntRepresentation(Sema *S, CallExpr *TheCall) {
                                   checkAllUnsignedTypes);
 }
 
+bool CheckIntRepresentation(Sema *S, CallExpr *TheCall) {
+  auto checkAllIntTypes = [](clang::QualType PassedType) -> bool {
+    return !PassedType->hasIntegerRepresentation();
+  };
+  return CheckArgsTypesAreCorrect(S, TheCall, S->Context.IntTy,
+				  checkAllIntTypes);
+}
+
 void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall,
                                 QualType ReturnType) {
   auto *VecTyA = TheCall->getArg(0)->getType()->getAs<VectorType>();
diff --git a/clang/test/CodeGen/builtins-elementwise-math.c b/clang/test/CodeGen/builtins-elementwise-math.c
index 8fb52992c0fe68..7e094a52653ef0 100644
--- a/clang/test/CodeGen/builtins-elementwise-math.c
+++ b/clang/test/CodeGen/builtins-elementwise-math.c
@@ -570,6 +570,43 @@ void test_builtin_elementwise_log2(float f1, float f2, double d1, double d2,
   vf2 = __builtin_elementwise_log2(vf1);
 }
 
+void test_builtin_elementwise_popcount(si8 vi1, si8 vi2,
+                                  long long int i1, long long int i2, short si,
+                                  _BitInt(31) bi1, _BitInt(31) bi2) {
+
+  
+  // CHECK:      [[I1:%.+]] = load i64, ptr %i1.addr, align 8
+  // CHECK-NEXT: call i64 @llvm.ctpop.i64(i64 [[I1]])
+  i2 = __builtin_elementwise_popcount(i1);
+
+  // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
+  // CHECK-NEXT: call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[VI1]])
+  vi2 = __builtin_elementwise_popcount(vi1);
+
+  // CHECK:      [[CVI2:%.+]] = load <8 x i16>, ptr %cvi2, align 16
+  // CHECK-NEXT: call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[CVI2]])
+  const si8 cvi2 = vi2;
+  vi2 = __builtin_elementwise_popcount(cvi2);
+
+  // CHECK:      [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
+  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
+  // CHECK-NEXT: call i31 @llvm.ctpop.i31(i31 [[LOADEDV]])
+  bi2 = __builtin_elementwise_popcount(bi1);
+
+  // CHECK:      [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
+  // CHECK-NEXT: call i32 @llvm.ctpop.i32(i32 [[IA1]])
+  b = __builtin_elementwise_popcount(int_as_one);
+
+  // CHECK:   call i32 @llvm.ctpop.i32(i32 -10)
+  b = __builtin_elementwise_popcount(-10);
+
+  // CHECK:      [[SI:%.+]] = load i16, ptr %si.addr, align 2
+  // CHECK-NEXT: [[SI_EXT:%.+]] = sext i16 [[SI]] to i32
+  // CHECK-NEXT: [[RES:%.+]] = call i32 @llvm.ctpop.i32(i32 [[SI_EXT]])
+  // CHECK-NEXT: = trunc i32 [[RES]] to i16
+  si = __builtin_elementwise_popcount(si);
+}
+
 void test_builtin_elementwise_pow(float f1, float f2, double d1, double d2,
                                       float4 vf1, float4 vf2) {
 
diff --git a/clang/test/Sema/builtins-elementwise-math.c b/clang/test/Sema/builtins-elementwise-math.c
index 628274380ae5f2..4383ad8d40088b 100644
--- a/clang/test/Sema/builtins-elementwise-math.c
+++ b/clang/test/Sema/builtins-elementwise-math.c
@@ -505,6 +505,27 @@ void test_builtin_elementwise_log2(int i, float f, double d, float4 v, int3 iv,
  // expected-error@-1 {{1st argument must be a floating point type (was 'unsigned4' (vector of 4 'unsigned int' values))}}
 }
 
+void test_builtin_elementwise_popcount(int i, float f, double d, float4 v, int3 iv, unsigned u, unsigned4 uv) {
+
+  struct Foo s = __builtin_elementwise_popcount(i);
+  // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'int'}}
+
+  i = __builtin_elementwise_popcount();
+  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+
+  i = __builtin_elementwise_popcount(f);
+  // expected-error@-1 {{1st argument must be a vector of integers (was 'float')}}
+
+  i = __builtin_elementwise_popcount(f, f);
+  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+
+  u = __builtin_elementwise_popcount(d);
+  // expected-error@-1 {{1st argument must be a vector of integers (was 'double')}}
+
+  v = __builtin_elementwise_popcount(v);
+  // expected-error@-1 {{1st argument must be a vector of integers (was 'float4' (vector of 4 'float' values))}}
+}
+
 void test_builtin_elementwise_pow(int i, short s, double d, float4 v, int3 iv, unsigned3 uv, int *p) {
   i = __builtin_elementwise_pow(p, d);
   // expected-error@-1 {{arguments are of different types ('int *' vs 'double')}}
diff --git a/clang/test/SemaCXX/builtins-elementwise-math.cpp b/clang/test/SemaCXX/builtins-elementwise-math.cpp
index 898d869f4c81be..c3d8bc593c0bbc 100644
--- a/clang/test/SemaCXX/builtins-elementwise-math.cpp
+++ b/clang/test/SemaCXX/builtins-elementwise-math.cpp
@@ -269,3 +269,11 @@ void test_builtin_elementwise_bitreverse() {
   static_assert(!is_const<decltype(__builtin_elementwise_bitreverse(a))>::value);
   static_assert(!is_const<decltype(__builtin_elementwise_bitreverse(b))>::value);  
 }
+
+void test_builtin_elementwise_popcount() {
+  const int a = 2;
+  int b = 1;
+  static_assert(!is_const<decltype(__builtin_elementwise_popcount(a))>::value);
+  static_assert(!is_const<decltype(__builtin_elementwise_popcount(b))>::value);  
+}
+
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 902ab37bf741ed..9aa0af3e3a6b17 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -553,6 +553,17 @@ def Rbits :  DXILOp<30, unary> {
   let attributes = [Attributes<DXIL1_0, [ReadNone]>];
 }
 
+def CBits :  DXILOp<31, unary> {
+  let Doc = "Returns the number of 1 bits in the specified value.";
+  let LLVMIntrinsic = int_ctpop;
+  let arguments = [OverloadTy];
+  let result = OverloadTy;
+  let overloads =
+      [Overloads<DXIL1_0, [Int16Ty, Int32Ty, Int64Ty]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
 def FMax :  DXILOp<35, binary> {
   let Doc = "Float maximum. FMax(a,b) = a > b ? a : b";
   let LLVMIntrinsic = int_maxnum;
diff --git a/llvm/test/CodeGen/DirectX/countbits.ll b/llvm/test/CodeGen/DirectX/countbits.ll
new file mode 100644
index 00000000000000..9ebce58109e871
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/countbits.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+
+; Make sure dxil operation function calls for countbits are generated for all integer types.
+
+; Function Attrs: nounwind
+define noundef i16 @test_countbits_short(i16 noundef %a) {
+entry:
+; CHECK:call i16 @dx.op.unary.i16(i32 31, i16 %{{.*}})
+  %elt.ctpop = call i16 @llvm.ctpop.i16(i16 %a)
+  ret i16 %elt.ctpop
+}
+
+; Function Attrs: nounwind
+define noundef i32 @test_countbits_int(i32 noundef %a) {
+entry:
+; CHECK:call i32 @dx.op.unary.i32(i32 31, i32 %{{.*}})
+  %elt.ctpop = call i32 @llvm.ctpop.i32(i32 %a)
+  ret i32 %elt.ctpop
+}
+
+; Function Attrs: nounwind
+define noundef i64 @test_countbits_long(i64 noundef %a) {
+entry:
+; CHECK:call i64 @dx.op.unary.i64(i32 31, i64 %{{.*}})
+  %elt.ctpop = call i64 @llvm.ctpop.i64(i64 %a)
+  ret i64 %elt.ctpop
+}
+
+declare i16 @llvm.ctpop.i16(i16)
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)
diff --git a/llvm/test/CodeGen/DirectX/countbits_error.ll b/llvm/test/CodeGen/DirectX/countbits_error.ll
new file mode 100644
index 00000000000000..e7adb103eaae7c
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/countbits_error.ll
@@ -0,0 +1,10 @@
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
+
+; DXIL operation ctpop does not support double overload type
+; CHECK: invalid intrinsic signature
+
+define noundef double @countbits_double(double noundef %a) {
+entry:
+  %elt.ctpop = call double @llvm.ctpop.f64(double %a)
+  ret double %elt.ctpop
+}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/countbits.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/countbits.ll
new file mode 100644
index 00000000000000..57ec0bda2e1890
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/countbits.ll
@@ -0,0 +1,21 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpMemoryModel Logical GLSL450
+
+define noundef i32 @countbits_i32(i32 noundef %a) {
+entry:
+; CHECK: %[[#]] = OpBitCount %[[#]] %[[#]]
+  %elt.bitreverse = call i32 @llvm.ctpop.i32(i32 %a)
+  ret i32 %elt.bitreverse
+}
+
+define noundef i16 @countbits_i16(i16 noundef %a) {
+entry:
+; CHECK: %[[#]] = OpBitCount %[[#]] %[[#]]
+  %elt.ctpop = call i16 @llvm.ctpop.i16(i16 %a)
+  ret i16 %elt.ctpop
+}
+
+declare i16 @llvm.ctpop.i16(i16)
+declare i32 @llvm.ctpop.i32(i32)

>From 73f26c9249913aec31a463a84b2afb39c6a57ddb Mon Sep 17 00:00:00 2001
From: Sarah Spall <spall at planetbauer.com>
Date: Wed, 11 Sep 2024 17:06:28 +0000
Subject: [PATCH 2/3] address pull request comments and make clang format happy

---
 clang/docs/ReleaseNotes.rst                   |  1 +
 clang/lib/CodeGen/CGBuiltin.cpp               |  2 +-
 clang/lib/Sema/SemaHLSL.cpp                   |  8 --
 .../test/CodeGenHLSL/builtins/countbits.hlsl  | 80 +++++++++++++++++++
 clang/test/Sema/builtins-elementwise-math.c   | 12 +++
 clang/test/Sema/countbits-errors.hlsl         | 28 +++++++
 .../SemaHLSL/BuiltIns/countbits-errors.hlsl   | 27 +++++++
 llvm/test/CodeGen/DirectX/countbits.ll        |  9 +--
 llvm/test/CodeGen/DirectX/countbits_error.ll  | 10 ---
 9 files changed, 152 insertions(+), 25 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/builtins/countbits.hlsl
 create mode 100644 clang/test/Sema/countbits-errors.hlsl
 create mode 100644 clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl
 delete mode 100644 llvm/test/CodeGen/DirectX/countbits_error.ll

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index d92b59334f8f32..b995e9637a1d64 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -114,6 +114,7 @@ C++ Language Changes
 
 - Accept C++26 user-defined ``static_assert`` messages in C++11 as an extension.
 
+- Add ``__builtin_elementwise_popcount`` builtin for integer types only.
 
 C++2c Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index df2b8b5595e8b3..7e18aafcdd4b8a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3836,7 +3836,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
         *this, E, llvm::Intrinsic::floor, "elt.floor"));
   case Builtin::BI__builtin_elementwise_popcount:
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
-	*this, E, llvm::Intrinsic::ctpop, "elt.ctpop"));
+        *this, E, llvm::Intrinsic::ctpop, "elt.ctpop"));
   case Builtin::BI__builtin_elementwise_roundeven:
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, llvm::Intrinsic::roundeven, "elt.roundeven"));
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 1d60bc6b7512be..a303f211501348 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -1601,14 +1601,6 @@ bool CheckUnsignedIntRepresentation(Sema *S, CallExpr *TheCall) {
                                   checkAllUnsignedTypes);
 }
 
-bool CheckIntRepresentation(Sema *S, CallExpr *TheCall) {
-  auto checkAllIntTypes = [](clang::QualType PassedType) -> bool {
-    return !PassedType->hasIntegerRepresentation();
-  };
-  return CheckArgsTypesAreCorrect(S, TheCall, S->Context.IntTy,
-				  checkAllIntTypes);
-}
-
 void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall,
                                 QualType ReturnType) {
   auto *VecTyA = TheCall->getArg(0)->getType()->getAs<VectorType>();
diff --git a/clang/test/CodeGenHLSL/builtins/countbits.hlsl b/clang/test/CodeGenHLSL/builtins/countbits.hlsl
new file mode 100644
index 00000000000000..b4028433dd9637
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/countbits.hlsl
@@ -0,0 +1,80 @@
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
+
+#ifdef __HLSL_ENABLE_16_BIT
+// CHECK: define noundef i16 @
+// CHECK: call i16 @llvm.ctpop.i16(
+uint16_t test_countbits_ushort(uint16_t p0)
+{
+	return countbits(p0);
+}
+// CHECK: define noundef <2 x i16> @
+// CHECK: call <2 x i16> @llvm.ctpop.v2i16
+uint16_t2 test_countbits_ushort2(uint16_t2 p0)
+{
+	return countbits(p0);
+}
+// CHECK: define noundef <3 x i16> @
+// CHECK: call <3 x i16> @llvm.ctpop.v3i16
+uint16_t3 test_countbits_ushort3(uint16_t3 p0)
+{
+	return countbits(p0);
+}
+// CHECK: define noundef <4 x i16> @
+// CHECK: call <4 x i16> @llvm.ctpop.v4i16
+uint16_t4 test_countbits_ushort4(uint16_t4 p0)
+{
+	return countbits(p0);
+}
+#endif
+
+// CHECK: define noundef i32 @
+// CHECK: call i32 @llvm.ctpop.i32(
+int test_countbits_uint(uint p0)
+{
+	return countbits(p0);
+}
+// CHECK: define noundef <2 x i32> @
+// CHECK: call <2 x i32> @llvm.ctpop.v2i32
+uint2 test_countbits_uint2(uint2 p0)
+{
+	return countbits(p0);
+}
+// CHECK: define noundef <3 x i32> @
+// CHECK: call <3 x i32> @llvm.ctpop.v3i32
+uint3 test_countbits_uint3(uint3 p0)
+{
+	return countbits(p0);
+}
+// CHECK: define noundef <4 x i32> @
+// CHECK: call <4 x i32> @llvm.ctpop.v4i32
+uint4 test_countbits_uint4(uint4 p0)
+{
+	return countbits(p0);
+}
+
+// CHECK: define noundef i64 @
+// CHECK: call i64 @llvm.ctpop.i64(
+uint64_t test_countbits_long(uint64_t p0)
+{
+	return countbits(p0);
+}
+// CHECK: define noundef <2 x i64> @
+// CHECK: call <2 x i64> @llvm.ctpop.v2i64
+uint64_t2 test_countbits_long2(uint64_t2 p0)
+{
+	return countbits(p0);
+}
+// CHECK: define noundef <3 x i64> @
+// CHECK: call <3 x i64> @llvm.ctpop.v3i64
+uint64_t3 test_countbits_long3(uint64_t3 p0)
+{
+	return countbits(p0);
+}
+// CHECK: define noundef <4 x i64> @
+// CHECK: call <4 x i64> @llvm.ctpop.v4i64
+uint64_t4 test_countbits_long4(uint64_t4 p0)
+{
+	return countbits(p0);
+}
diff --git a/clang/test/Sema/builtins-elementwise-math.c b/clang/test/Sema/builtins-elementwise-math.c
index 4383ad8d40088b..1727be1d6286d5 100644
--- a/clang/test/Sema/builtins-elementwise-math.c
+++ b/clang/test/Sema/builtins-elementwise-math.c
@@ -524,6 +524,18 @@ void test_builtin_elementwise_popcount(int i, float f, double d, float4 v, int3
 
   v = __builtin_elementwise_popcount(v);
   // expected-error@-1 {{1st argument must be a vector of integers (was 'float4' (vector of 4 'float' values))}}
+
+  int2 i2 = __builtin_elementwise_popcount(iv);
+  // expected-error@-1 {{initializing 'int2' (vector of 2 'int' values) with an expression of incompatible type 'int3' (vector of 3 'int' values)}}
+
+  iv = __builtin_elementwise_popcount(i2);
+  // expected-error@-1 {{assigning to 'int3' (vector of 3 'int' values) from incompatible type 'int2' (vector of 2 'int' values)}}
+
+  unsigned3 u3 = __builtin_elementwise_popcount(iv);
+  // expected-error@-1 {{initializing 'unsigned3' (vector of 3 'unsigned int' values) with an expression of incompatible type 'int3' (vector of 3 'int' values)}}
+
+  iv = __builtin_elementwise_popcount(u3);
+  // expected-error@-1 {{assigning to 'int3' (vector of 3 'int' values) from incompatible type 'unsigned3' (vector of 3 'unsigned int' values)}}
 }
 
 void test_builtin_elementwise_pow(int i, short s, double d, float4 v, int3 iv, unsigned3 uv, int *p) {
diff --git a/clang/test/Sema/countbits-errors.hlsl b/clang/test/Sema/countbits-errors.hlsl
new file mode 100644
index 00000000000000..0fd36fe78d79f0
--- /dev/null
+++ b/clang/test/Sema/countbits-errors.hlsl
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -finclude-default-header
+// -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only
+// -disable-llvm-passes -verify
+
+double2 test_int_builtin(double2 p0) {
+  return __builtin_hlsl_elementwise_countbits(p0);
+  // expected-error@-1 {{passing 'double2' (aka 'vector<double, 2>') to
+  // parameter of incompatible type
+  // '__attribute__((__vector_size__(2 * sizeof(int)))) int'
+  // (vector of 2 'int' values)}}
+}
+
+float test_ambiguous(float p0) {
+  return countbits(p0);
+  // expected-error@-1 {{call to 'countbits' is ambiguous}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+}
+
+float test_float_builtin(float p0) {
+  return __builtin_hlsl_elementwise_countbits(p0);
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type
+  // 'int'}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl
new file mode 100644
index 00000000000000..3976302dba3eb9
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -finclude-default-header
+// -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only
+// -disable-llvm-passes -verify
+
+
+double test_int_builtin(double p0) {
+  return countbits(p0);
+  // expected-error@-1 {{call to 'countbits' is ambiguous}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
+}
+
+double2 test_int_builtin_2(double2 p0) {
+  return __builtin_elementwise_popcount(p0);
+  // expected-error@-1 {{1st argument must be a vector of integers
+  // (was 'double2' (aka 'vector<double, 2>'))}}
+}
+
+double test_int_builtin_3(float p0) {
+  return __builtin_elementwise_popcount(p0);
+  // expected-error@-1 {{1st argument must be a vector of integers
+  // (was 'float')}}
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/DirectX/countbits.ll b/llvm/test/CodeGen/DirectX/countbits.ll
index 9ebce58109e871..96206f9e41d176 100644
--- a/llvm/test/CodeGen/DirectX/countbits.ll
+++ b/llvm/test/CodeGen/DirectX/countbits.ll
@@ -2,26 +2,23 @@
 
 ; Make sure dxil operation function calls for countbits are generated for all integer types.
 
-; Function Attrs: nounwind
 define noundef i16 @test_countbits_short(i16 noundef %a) {
 entry:
-; CHECK:call i16 @dx.op.unary.i16(i32 31, i16 %{{.*}})
+; CHECK: call i16 @dx.op.unary.i16(i32 31, i16 %{{.*}})
   %elt.ctpop = call i16 @llvm.ctpop.i16(i16 %a)
   ret i16 %elt.ctpop
 }
 
-; Function Attrs: nounwind
 define noundef i32 @test_countbits_int(i32 noundef %a) {
 entry:
-; CHECK:call i32 @dx.op.unary.i32(i32 31, i32 %{{.*}})
+; CHECK: call i32 @dx.op.unary.i32(i32 31, i32 %{{.*}})
   %elt.ctpop = call i32 @llvm.ctpop.i32(i32 %a)
   ret i32 %elt.ctpop
 }
 
-; Function Attrs: nounwind
 define noundef i64 @test_countbits_long(i64 noundef %a) {
 entry:
-; CHECK:call i64 @dx.op.unary.i64(i32 31, i64 %{{.*}})
+; CHECK: call i64 @dx.op.unary.i64(i32 31, i64 %{{.*}})
   %elt.ctpop = call i64 @llvm.ctpop.i64(i64 %a)
   ret i64 %elt.ctpop
 }
diff --git a/llvm/test/CodeGen/DirectX/countbits_error.ll b/llvm/test/CodeGen/DirectX/countbits_error.ll
deleted file mode 100644
index e7adb103eaae7c..00000000000000
--- a/llvm/test/CodeGen/DirectX/countbits_error.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
-
-; DXIL operation ctpop does not support double overload type
-; CHECK: invalid intrinsic signature
-
-define noundef double @countbits_double(double noundef %a) {
-entry:
-  %elt.ctpop = call double @llvm.ctpop.f64(double %a)
-  ret double %elt.ctpop
-}

>From 33880520ab83c51e526dddf057f049493bcbb4c2 Mon Sep 17 00:00:00 2001
From: Sarah Spall <spall at planetbauer.com>
Date: Tue, 17 Sep 2024 21:47:46 +0000
Subject: [PATCH 3/3] address latest PR comments

---
 .../test/CodeGenHLSL/builtins/countbits.hlsl  | 30 +++++++++----------
 .../SemaHLSL/BuiltIns/countbits-errors.hlsl   | 10 ++-----
 llvm/test/CodeGen/DirectX/countbits.ll        | 21 ++++++++++++-
 3 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/clang/test/CodeGenHLSL/builtins/countbits.hlsl b/clang/test/CodeGenHLSL/builtins/countbits.hlsl
index b4028433dd9637..8dfe977bfae626 100644
--- a/clang/test/CodeGenHLSL/builtins/countbits.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/countbits.hlsl
@@ -3,25 +3,25 @@
 // RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
 
 #ifdef __HLSL_ENABLE_16_BIT
-// CHECK: define noundef i16 @
-// CHECK: call i16 @llvm.ctpop.i16(
+// CHECK-LABEL: test_countbits_ushort
+// CHECK: call i16 @llvm.ctpop.i16
 uint16_t test_countbits_ushort(uint16_t p0)
 {
 	return countbits(p0);
 }
-// CHECK: define noundef <2 x i16> @
+// CHECK-LABEL: test_countbits_ushort2
 // CHECK: call <2 x i16> @llvm.ctpop.v2i16
 uint16_t2 test_countbits_ushort2(uint16_t2 p0)
 {
 	return countbits(p0);
 }
-// CHECK: define noundef <3 x i16> @
+// CHECK-LABEL: test_countbits_ushort3
 // CHECK: call <3 x i16> @llvm.ctpop.v3i16
 uint16_t3 test_countbits_ushort3(uint16_t3 p0)
 {
 	return countbits(p0);
 }
-// CHECK: define noundef <4 x i16> @
+// CHECK-LABEL: test_countbits_ushort4
 // CHECK: call <4 x i16> @llvm.ctpop.v4i16
 uint16_t4 test_countbits_ushort4(uint16_t4 p0)
 {
@@ -29,50 +29,50 @@ uint16_t4 test_countbits_ushort4(uint16_t4 p0)
 }
 #endif
 
-// CHECK: define noundef i32 @
-// CHECK: call i32 @llvm.ctpop.i32(
+// CHECK-LABEL: test_countbits_uint
+// CHECK: call i32 @llvm.ctpop.i32
 int test_countbits_uint(uint p0)
 {
 	return countbits(p0);
 }
-// CHECK: define noundef <2 x i32> @
+// CHECK-LABEL: test_countbits_uint2
 // CHECK: call <2 x i32> @llvm.ctpop.v2i32
 uint2 test_countbits_uint2(uint2 p0)
 {
 	return countbits(p0);
 }
-// CHECK: define noundef <3 x i32> @
+// CHECK-LABEL: test_countbits_uint3
 // CHECK: call <3 x i32> @llvm.ctpop.v3i32
 uint3 test_countbits_uint3(uint3 p0)
 {
 	return countbits(p0);
 }
-// CHECK: define noundef <4 x i32> @
+// CHECK-LABEL: test_countbits_uint4
 // CHECK: call <4 x i32> @llvm.ctpop.v4i32
 uint4 test_countbits_uint4(uint4 p0)
 {
 	return countbits(p0);
 }
 
-// CHECK: define noundef i64 @
-// CHECK: call i64 @llvm.ctpop.i64(
+// CHECK-LABEL: test_countbits_long
+// CHECK: call i64 @llvm.ctpop.i64
 uint64_t test_countbits_long(uint64_t p0)
 {
 	return countbits(p0);
 }
-// CHECK: define noundef <2 x i64> @
+// CHECK-LABEL: test_countbits_long2
 // CHECK: call <2 x i64> @llvm.ctpop.v2i64
 uint64_t2 test_countbits_long2(uint64_t2 p0)
 {
 	return countbits(p0);
 }
-// CHECK: define noundef <3 x i64> @
+// CHECK-LABEL: test_countbits_long3
 // CHECK: call <3 x i64> @llvm.ctpop.v3i64
 uint64_t3 test_countbits_long3(uint64_t3 p0)
 {
 	return countbits(p0);
 }
-// CHECK: define noundef <4 x i64> @
+// CHECK-LABEL: test_countbits_long4
 // CHECK: call <4 x i64> @llvm.ctpop.v4i64
 uint64_t4 test_countbits_long4(uint64_t4 p0)
 {
diff --git a/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl
index 3976302dba3eb9..8d5f0abb2860f8 100644
--- a/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl
@@ -1,17 +1,11 @@
 // RUN: %clang_cc1 -finclude-default-header
 // -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only
-// -disable-llvm-passes -verify
+// -disable-llvm-passes -verify -verify-ignore-unexpected
 
 
 double test_int_builtin(double p0) {
   return countbits(p0);
   // expected-error@-1 {{call to 'countbits' is ambiguous}}
-  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
-  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
-  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
-  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
-  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
-  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function}}
 }
 
 double2 test_int_builtin_2(double2 p0) {
@@ -24,4 +18,4 @@ double test_int_builtin_3(float p0) {
   return __builtin_elementwise_popcount(p0);
   // expected-error@-1 {{1st argument must be a vector of integers
   // (was 'float')}}
-}
\ No newline at end of file
+}
diff --git a/llvm/test/CodeGen/DirectX/countbits.ll b/llvm/test/CodeGen/DirectX/countbits.ll
index 96206f9e41d176..c6bc2b6790948e 100644
--- a/llvm/test/CodeGen/DirectX/countbits.ll
+++ b/llvm/test/CodeGen/DirectX/countbits.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
 
 ; Make sure dxil operation function calls for countbits are generated for all integer types.
 
@@ -23,6 +23,25 @@ entry:
   ret i64 %elt.ctpop
 }
 
+define noundef <4 x i32> @countbits_vec4_i32(<4 x i32> noundef %a)  {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x i32> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call i32 @dx.op.unary.i32(i32 31, i32 [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x i32> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call i32 @dx.op.unary.i32(i32 31, i32 [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x i32> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call i32 @dx.op.unary.i32(i32 31, i32 [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x i32> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call i32 @dx.op.unary.i32(i32 31, i32 [[ee3]])
+  ; CHECK: insertelement <4 x i32> poison, i32 [[ie0]], i64 0
+  ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie1]], i64 1
+  ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie2]], i64 2
+  ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie3]], i64 3
+  %2 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
+  ret <4 x i32> %2
+}
+
 declare i16 @llvm.ctpop.i16(i16)
 declare i32 @llvm.ctpop.i32(i32)
 declare i64 @llvm.ctpop.i64(i64)
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)