[llvm] [AArch64] Expand llvm.histogram intrinsic to support umax, umin, and uadd.sat operations (PR #138447)
via llvm-commits
llvm-commits at lists.llvm.org
Sun May 4 03:40:21 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-ir
Author: None (RonDahan101)
<details>
<summary>Changes</summary>
This patch extends the `llvm.experimental.vector.histogram.*` intrinsic family to support update operations beyond the existing `add`. Specifically, the newly supported operations are:
* umax: unsigned maximum
* umin: unsigned minimum
* uadd.sat: unsigned saturated addition
Based on the discussion from:
https://discourse.llvm.org/t/rfc-expanding-the-experimental-histogram-intrinsic/84673
---
Full diff: https://github.com/llvm/llvm-project/pull/138447.diff
4 Files Affected:
- (modified) llvm/docs/LangRef.rst (+5-3)
- (modified) llvm/include/llvm/IR/Intrinsics.td (+18)
- (modified) llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp (+23-2)
- (modified) llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll (+342)
``````````diff
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index deb87365ae8d7..fe2c9f3bc8091 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -2932,9 +2932,8 @@ the behavior is undefined, unless one of the following exceptions applies:
must be a null pointer, otherwise the behavior is undefined.
* ``dereferenceable(<n>)`` operand bundles only guarantee the pointer is
- dereferenceable at the point of the assumption. The pointer may not be
- dereferenceable at later pointers, e.g. because it could have been
- freed.
+ dereferenceable at the point of the assumption. The pointer may not be
+ dereferenceable at later points, e.g. because it could have been freed.
In addition to allowing operand bundles encoding function and parameter
attributes, an assume operand bundle my also encode a ``separate_storage``
@@ -20295,6 +20294,9 @@ More update operation types may be added in the future.
declare void @llvm.experimental.vector.histogram.add.v8p0.i32(<8 x ptr> %ptrs, i32 %inc, <8 x i1> %mask)
declare void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %ptrs, i64 %inc, <vscale x 2 x i1> %mask)
+ declare void @llvm.experimental.vector.histogram.uadd.sat.v8p0.i32(<8 x ptr> %ptrs, i32 %inc, <8 x i1> %mask)
+ declare void @llvm.experimental.vector.histogram.umax.v8p0.i32(<8 x ptr> %ptrs, i32 %val, <8 x i1> %mask)
+ declare void @llvm.experimental.vector.histogram.umin.v8p0.i32(<8 x ptr> %ptrs, i32 %val, <8 x i1> %mask)
Arguments:
""""""""""
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 14ecae41ff08f..31a0ba2e6500d 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1947,6 +1947,24 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
[ IntrArgMemOnly ]>;
+def int_experimental_vector_histogram_uadd_sat : DefaultAttrsIntrinsic<[],
+ [ llvm_anyvector_ty, // Vector of pointers
+ llvm_anyint_ty, // Increment
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+ [ IntrArgMemOnly ]>;
+
+def int_experimental_vector_histogram_umin : DefaultAttrsIntrinsic<[],
+ [ llvm_anyvector_ty, // Vector of pointers
+ llvm_anyint_ty, // Update value
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+ [ IntrArgMemOnly ]>;
+
+def int_experimental_vector_histogram_umax : DefaultAttrsIntrinsic<[],
+ [ llvm_anyvector_ty, // Vector of pointers
+ llvm_anyint_ty, // Update value
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+ [ IntrArgMemOnly ]>;
+
// Experimental match
def int_experimental_vector_match : DefaultAttrsIntrinsic<
[ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ],
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index 63fcc1760ccaf..aa7fb5a8d780a 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -997,8 +997,26 @@ static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI,
Builder.SetInsertPoint(CondBlock->getTerminator());
Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx));
LoadInst *Load = Builder.CreateLoad(EltTy, Ptr, "Load" + Twine(Idx));
- Value *Add = Builder.CreateAdd(Load, Inc);
- Builder.CreateStore(Add, Ptr);
+ Value *UpdateOp;
+ switch (cast<IntrinsicInst>(CI)->getIntrinsicID()) {
+ case Intrinsic::experimental_vector_histogram_add:
+ UpdateOp = Builder.CreateAdd(Load, Inc);
+ break;
+ case Intrinsic::experimental_vector_histogram_uadd_sat:
+ UpdateOp =
+ Builder.CreateIntrinsic(Intrinsic::uadd_sat, {EltTy}, {Load, Inc});
+ break;
+ case Intrinsic::experimental_vector_histogram_umin:
+ UpdateOp = Builder.CreateIntrinsic(Intrinsic::umin, {EltTy}, {Load, Inc});
+ break;
+ case Intrinsic::experimental_vector_histogram_umax:
+ UpdateOp = Builder.CreateIntrinsic(Intrinsic::umax, {EltTy}, {Load, Inc});
+ break;
+
+ default:
+ llvm_unreachable("Unexpected histogram intrinsic");
+ }
+ Builder.CreateStore(UpdateOp, Ptr);
// Create "else" block, fill it in the next iteration
BasicBlock *NewIfBlock = ThenTerm->getSuccessor(0);
@@ -1089,6 +1107,9 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
default:
break;
case Intrinsic::experimental_vector_histogram_add:
+ case Intrinsic::experimental_vector_histogram_uadd_sat:
+ case Intrinsic::experimental_vector_histogram_umin:
+ case Intrinsic::experimental_vector_histogram_umax:
if (TTI.isLegalMaskedVectorHistogram(CI->getArgOperand(0)->getType(),
CI->getArgOperand(1)->getType()))
return false;
diff --git a/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll b/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll
index e59d9098a30d6..55d48cb40c6b1 100644
--- a/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll
+++ b/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll
@@ -112,3 +112,345 @@ define void @histogram_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) {
call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
ret void
}
+
+define void @histogram_uadd_sat_i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) {
+; CHECK-LABEL: histogram_uadd_sat_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: tbnz w8, #0, .LBB3_3
+; CHECK-NEXT: // %bb.1: // %else
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: tbnz w8, #0, .LBB3_4
+; CHECK-NEXT: .LBB3_2: // %else2
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB3_3: // %cond.histogram.update
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: ldr x9, [x8]
+; CHECK-NEXT: adds x9, x9, x0
+; CHECK-NEXT: csinv x9, x9, xzr, lo
+; CHECK-NEXT: str x9, [x8]
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: tbz w8, #0, .LBB3_2
+; CHECK-NEXT: .LBB3_4: // %cond.histogram.update1
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: ldr x9, [x8]
+; CHECK-NEXT: adds x9, x9, x0
+; CHECK-NEXT: csinv x9, x9, xzr, lo
+; CHECK-NEXT: str x9, [x8]
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vector.histogram.uadd.sat.nxv2p0.i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask)
+ ret void
+}
+
+define void @histogram_uadd_sat_i32_literal(ptr %base, <4 x i32> %indices, <4 x i1> %mask) {
+; CHECK-LABEL: histogram_uadd_sat_i32_literal:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v2.2d, x0
+; CHECK-NEXT: sshll v3.2d, v0.2s, #2
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: umov w8, v1.h[0]
+; CHECK-NEXT: add v3.2d, v2.2d, v3.2d
+; CHECK-NEXT: tbz w8, #0, .LBB4_2
+; CHECK-NEXT: // %bb.1: // %cond.histogram.update
+; CHECK-NEXT: fmov x8, d3
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: adds w9, w9, #1
+; CHECK-NEXT: csinv w9, w9, wzr, lo
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: .LBB4_2: // %else
+; CHECK-NEXT: umov w8, v1.h[1]
+; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2
+; CHECK-NEXT: tbz w8, #0, .LBB4_4
+; CHECK-NEXT: // %bb.3: // %cond.histogram.update1
+; CHECK-NEXT: mov x8, v3.d[1]
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: adds w9, w9, #1
+; CHECK-NEXT: csinv w9, w9, wzr, lo
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: .LBB4_4: // %else2
+; CHECK-NEXT: umov w8, v1.h[2]
+; CHECK-NEXT: add v0.2d, v2.2d, v0.2d
+; CHECK-NEXT: tbnz w8, #0, .LBB4_7
+; CHECK-NEXT: // %bb.5: // %else4
+; CHECK-NEXT: umov w8, v1.h[3]
+; CHECK-NEXT: tbnz w8, #0, .LBB4_8
+; CHECK-NEXT: .LBB4_6: // %else6
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB4_7: // %cond.histogram.update3
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: adds w9, w9, #1
+; CHECK-NEXT: csinv w9, w9, wzr, lo
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: umov w8, v1.h[3]
+; CHECK-NEXT: tbz w8, #0, .LBB4_6
+; CHECK-NEXT: .LBB4_8: // %cond.histogram.update5
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: adds w9, w9, #1
+; CHECK-NEXT: csinv w9, w9, wzr, lo
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: ret
+ %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
+ call void @llvm.experimental.vector.histogram.uadd.sat.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_uadd_sat_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) {
+; CHECK-LABEL: histogram_uadd_sat_i32_literal_alltruemask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v1.2d, x0
+; CHECK-NEXT: sshll v2.2d, v0.2s, #2
+; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2
+; CHECK-NEXT: add v2.2d, v1.2d, v2.2d
+; CHECK-NEXT: add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: fmov x8, d2
+; CHECK-NEXT: mov x9, v2.d[1]
+; CHECK-NEXT: ldr w10, [x8]
+; CHECK-NEXT: add w10, w10, #1
+; CHECK-NEXT: str w10, [x8]
+; CHECK-NEXT: ldr w8, [x9]
+; CHECK-NEXT: add w8, w8, #1
+; CHECK-NEXT: str w8, [x9]
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: mov x9, v0.d[1]
+; CHECK-NEXT: ldr w10, [x8]
+; CHECK-NEXT: add w10, w10, #1
+; CHECK-NEXT: str w10, [x8]
+; CHECK-NEXT: ldr w8, [x9]
+; CHECK-NEXT: add w8, w8, #1
+; CHECK-NEXT: str w8, [x9]
+; CHECK-NEXT: ret
+ %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
+ call void @llvm.experimental.vector.histogram.uadd.sat.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Exercises the scalarized, variable-mask lowering of the umax histogram update.
+; NOTE(review): CHECK lines hand-updated for umax (cmp + csel hi); regenerate with update_llc_test_checks.py to confirm.
+define void @histogram_umax_i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) {
+; CHECK-LABEL: histogram_umax_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: tbnz w8, #0, .LBB6_3
+; CHECK-NEXT: // %bb.1: // %else
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: tbnz w8, #0, .LBB6_4
+; CHECK-NEXT: .LBB6_2: // %else2
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB6_3: // %cond.histogram.update
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: ldr x9, [x8]
+; CHECK-NEXT: cmp x9, x0
+; CHECK-NEXT: csel x9, x9, x0, hi
+; CHECK-NEXT: str x9, [x8]
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: tbz w8, #0, .LBB6_2
+; CHECK-NEXT: .LBB6_4: // %cond.histogram.update1
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: ldr x9, [x8]
+; CHECK-NEXT: cmp x9, x0
+; CHECK-NEXT: csel x9, x9, x0, hi
+; CHECK-NEXT: str x9, [x8]
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vector.histogram.umax.nxv2p0.i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask)
+ ret void
+}
+
+define void @histogram_umax_i32_literal(ptr %base, <4 x i32> %indices, <4 x i1> %mask) {
+; CHECK-LABEL: histogram_umax_i32_literal:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v2.2d, x0
+; CHECK-NEXT: sshll v3.2d, v0.2s, #2
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: umov w8, v1.h[0]
+; CHECK-NEXT: add v3.2d, v2.2d, v3.2d
+; CHECK-NEXT: tbz w8, #0, .LBB7_2
+; CHECK-NEXT: // %bb.1: // %cond.histogram.update
+; CHECK-NEXT: fmov x8, d3
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: adds w9, w9, #1
+; CHECK-NEXT: csinv w9, w9, wzr, lo
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: .LBB7_2: // %else
+; CHECK-NEXT: umov w8, v1.h[1]
+; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2
+; CHECK-NEXT: tbz w8, #0, .LBB7_4
+; CHECK-NEXT: // %bb.3: // %cond.histogram.update1
+; CHECK-NEXT: mov x8, v3.d[1]
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: adds w9, w9, #1
+; CHECK-NEXT: csinv w9, w9, wzr, lo
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: .LBB7_4: // %else2
+; CHECK-NEXT: umov w8, v1.h[2]
+; CHECK-NEXT: add v0.2d, v2.2d, v0.2d
+; CHECK-NEXT: tbnz w8, #0, .LBB7_7
+; CHECK-NEXT: // %bb.5: // %else4
+; CHECK-NEXT: umov w8, v1.h[3]
+; CHECK-NEXT: tbnz w8, #0, .LBB7_8
+; CHECK-NEXT: .LBB7_6: // %else6
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB7_7: // %cond.histogram.update3
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: adds w9, w9, #1
+; CHECK-NEXT: csinv w9, w9, wzr, lo
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: umov w8, v1.h[3]
+; CHECK-NEXT: tbz w8, #0, .LBB7_6
+; CHECK-NEXT: .LBB7_8: // %cond.histogram.update5
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: adds w9, w9, #1
+; CHECK-NEXT: csinv w9, w9, wzr, lo
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: ret
+ %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
+ call void @llvm.experimental.vector.histogram.uadd.sat.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_umax_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) {
+; CHECK-LABEL: histogram_umax_i32_literal_alltruemask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v1.2d, x0
+; CHECK-NEXT: sshll v2.2d, v0.2s, #2
+; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2
+; CHECK-NEXT: add v2.2d, v1.2d, v2.2d
+; CHECK-NEXT: add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: fmov x8, d2
+; CHECK-NEXT: mov x9, v2.d[1]
+; CHECK-NEXT: ldr w10, [x8]
+; CHECK-NEXT: add w10, w10, #1
+; CHECK-NEXT: str w10, [x8]
+; CHECK-NEXT: ldr w8, [x9]
+; CHECK-NEXT: add w8, w8, #1
+; CHECK-NEXT: str w8, [x9]
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: mov x9, v0.d[1]
+; CHECK-NEXT: ldr w10, [x8]
+; CHECK-NEXT: add w10, w10, #1
+; CHECK-NEXT: str w10, [x8]
+; CHECK-NEXT: ldr w8, [x9]
+; CHECK-NEXT: add w8, w8, #1
+; CHECK-NEXT: str w8, [x9]
+; CHECK-NEXT: ret
+ %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
+ call void @llvm.experimental.vector.histogram.uadd.sat.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+
+; Exercises the scalarized, variable-mask lowering of the umin histogram update.
+; NOTE(review): CHECK lines hand-updated for umin (cmp + csel lo); regenerate with update_llc_test_checks.py to confirm.
+define void @histogram_umin_i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) {
+; CHECK-LABEL: histogram_umin_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: tbnz w8, #0, .LBB9_3
+; CHECK-NEXT: // %bb.1: // %else
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: tbnz w8, #0, .LBB9_4
+; CHECK-NEXT: .LBB9_2: // %else2
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB9_3: // %cond.histogram.update
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: ldr x9, [x8]
+; CHECK-NEXT: cmp x9, x0
+; CHECK-NEXT: csel x9, x9, x0, lo
+; CHECK-NEXT: str x9, [x8]
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: tbz w8, #0, .LBB9_2
+; CHECK-NEXT: .LBB9_4: // %cond.histogram.update1
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: ldr x9, [x8]
+; CHECK-NEXT: cmp x9, x0
+; CHECK-NEXT: csel x9, x9, x0, lo
+; CHECK-NEXT: str x9, [x8]
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vector.histogram.umin.nxv2p0.i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask)
+ ret void
+}
+
+define void @histogram_umin_i32_literal(ptr %base, <4 x i32> %indices, <4 x i1> %mask) {
+; CHECK-LABEL: histogram_umin_i32_literal:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v2.2d, x0
+; CHECK-NEXT: sshll v3.2d, v0.2s, #2
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: umov w8, v1.h[0]
+; CHECK-NEXT: add v3.2d, v2.2d, v3.2d
+; CHECK-NEXT: tbz w8, #0, .LBB10_2
+; CHECK-NEXT: // %bb.1: // %cond.histogram.update
+; CHECK-NEXT: fmov x8, d3
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: adds w9, w9, #1
+; CHECK-NEXT: csinv w9, w9, wzr, lo
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: .LBB10_2: // %else
+; CHECK-NEXT: umov w8, v1.h[1]
+; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2
+; CHECK-NEXT: tbz w8, #0, .LBB10_4
+; CHECK-NEXT: // %bb.3: // %cond.histogram.update1
+; CHECK-NEXT: mov x8, v3.d[1]
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: adds w9, w9, #1
+; CHECK-NEXT: csinv w9, w9, wzr, lo
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: .LBB10_4: // %else2
+; CHECK-NEXT: umov w8, v1.h[2]
+; CHECK-NEXT: add v0.2d, v2.2d, v0.2d
+; CHECK-NEXT: tbnz w8, #0, .LBB10_7
+; CHECK-NEXT: // %bb.5: // %else4
+; CHECK-NEXT: umov w8, v1.h[3]
+; CHECK-NEXT: tbnz w8, #0, .LBB10_8
+; CHECK-NEXT: .LBB10_6: // %else6
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB10_7: // %cond.histogram.update3
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: adds w9, w9, #1
+; CHECK-NEXT: csinv w9, w9, wzr, lo
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: umov w8, v1.h[3]
+; CHECK-NEXT: tbz w8, #0, .LBB10_6
+; CHECK-NEXT: .LBB10_8: // %cond.histogram.update5
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: ldr w9, [x8]
+; CHECK-NEXT: adds w9, w9, #1
+; CHECK-NEXT: csinv w9, w9, wzr, lo
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: ret
+ %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
+ call void @llvm.experimental.vector.histogram.uadd.sat.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_umin_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) {
+; CHECK-LABEL: histogram_umin_i32_literal_alltruemask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v1.2d, x0
+; CHECK-NEXT: sshll v2.2d, v0.2s, #2
+; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2
+; CHECK-NEXT: add v2.2d, v1.2d, v2.2d
+; CHECK-NEXT: add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: fmov x8, d2
+; CHECK-NEXT: mov x9, v2.d[1]
+; CHECK-NEXT: ldr w10, [x8]
+; CHECK-NEXT: add w10, w10, #1
+; CHECK-NEXT: str w10, [x8]
+; CHECK-NEXT: ldr w8, [x9]
+; CHECK-NEXT: add w8, w8, #1
+; CHECK-NEXT: str w8, [x9]
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: mov x9, v0.d[1]
+; CHECK-NEXT: ldr w10, [x8]
+; CHECK-NEXT: add w10, w10, #1
+; CHECK-NEXT: str w10, [x8]
+; CHECK-NEXT: ldr w8, [x9]
+; CHECK-NEXT: add w8, w8, #1
+; CHECK-NEXT: str w8, [x9]
+; CHECK-NEXT: ret
+ %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
+ call void @llvm.experimental.vector.histogram.uadd.sat.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/138447
More information about the llvm-commits
mailing list