[llvm] [X86] X86TargetLowering::computeKnownBitsForTargetNode - add X86ISD::VPMADD52L/H handling - again (PR #159230)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 21 20:25:02 PDT 2025
https://github.com/houngkoungting updated https://github.com/llvm/llvm-project/pull/159230
>From 80e303c6e0976d8c2437a806679a54d5919c5917 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Wed, 6 Aug 2025 16:17:48 +0800
Subject: [PATCH 01/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes
if they have sufficient leading zero/sign bits-1
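A minimal IR sketch of the pattern this fold targets, in the spirit of the AArch64 tests below (illustrative only; the combine itself operates on the ISD::AVGFLOORU/AVGCEILU/AVGFLOORS/AVGCEILS DAG nodes, and the function and value names here are not part of the patch). When every bit of both operands above the destination width is known zero - or, for the signed nodes, the operands have enough sign bits - the wide average followed by a truncate can be rebuilt as the same average at the narrow type:

define <8 x i8> @sketch_avgfloor_u(<8 x i8> %x, <8 x i8> %y) {
  ; the upper 8 bits of %xw and %yw are known zero, so the i16 average also has
  ; its upper 8 bits zero and the trunc can be folded into an 8-bit uhadd
  %xw = zext <8 x i8> %x to <8 x i16>
  %yw = zext <8 x i8> %y to <8 x i16>
  %avg = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %xw, <8 x i16> %yw)
  %r = trunc <8 x i16> %avg to <8 x i8>
  ret <8 x i8> %r
}
declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>)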
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 45 +++++++++++++++++++
llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 43 ++++++++++++++++++
2 files changed, 88 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d70e96938ed9a..9ff256f8090ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16294,6 +16294,51 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
// because targets may prefer a wider type during later combines and invert
// this transform.
switch (N0.getOpcode()) {
+ case ISD::AVGCEILU:
+ case ISD::AVGFLOORU:
+ if (!LegalOperations && N0.hasOneUse() &&
+ TLI.isOperationLegal(N0.getOpcode(), VT)) {
+ SDValue X = N0.getOperand(0);
+ SDValue Y = N0.getOperand(1);
+
+ KnownBits KnownX = DAG.computeKnownBits(X);
+ KnownBits KnownY = DAG.computeKnownBits(Y);
+
+ unsigned SrcBits = X.getScalarValueSizeInBits();
+ unsigned DstBits = VT.getScalarSizeInBits();
+ unsigned NeededLeadingZeros = SrcBits - DstBits + 1;
+
+ if (KnownX.countMinLeadingZeros() >= NeededLeadingZeros &&
+ KnownY.countMinLeadingZeros() >= NeededLeadingZeros) {
+ SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
+ SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
+ return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
+ }
+ }
+ break;
+
+ case ISD::AVGCEILS:
+ case ISD::AVGFLOORS:
+ if (!LegalOperations && N0.hasOneUse() &&
+ TLI.isOperationLegal(N0.getOpcode(), VT)) {
+ SDValue X = N0.getOperand(0);
+ SDValue Y = N0.getOperand(1);
+
+ unsigned SignBitsX = DAG.ComputeNumSignBits(X);
+ unsigned SignBitsY = DAG.ComputeNumSignBits(Y);
+
+ unsigned SrcBits = X.getScalarValueSizeInBits();
+ unsigned DstBits = VT.getScalarSizeInBits();
+ unsigned NeededSignBits = SrcBits - DstBits + 1;
+
+ if (SignBitsX >= NeededSignBits && SignBitsY >= NeededSignBits) {
+ SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
+ SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
+ return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
+ }
+ }
+ break;
+
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
new file mode 100644
index 0000000000000..175f54d6f9c05
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -0,0 +1,43 @@
+; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s
+
+; CHECK-LABEL: test_avgceil_u
+; CHECK: uhadd v0.8b, v0.8b, v1.8b
+define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
+ %ta = trunc <8 x i16> %a to <8 x i8>
+ %tb = trunc <8 x i16> %b to <8 x i8>
+ %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+ ret <8 x i8> %res
+}
+
+; CHECK-LABEL: test_avgceil_s
+; CHECK: shadd v0.8b, v0.8b, v1.8b
+define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
+ %ta = trunc <8 x i16> %a to <8 x i8>
+ %tb = trunc <8 x i16> %b to <8 x i8>
+ %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+ ret <8 x i8> %res
+}
+
+; CHECK-LABEL: test_avgfloor_u
+; CHECK: urhadd v0.8b, v0.8b, v1.8b
+define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) {
+ %ta = trunc <8 x i16> %a to <8 x i8>
+ %tb = trunc <8 x i16> %b to <8 x i8>
+ %res = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+ ret <8 x i8> %res
+}
+
+; CHECK-LABEL: test_avgfloor_s
+; CHECK: srhadd v0.8b, v0.8b, v1.8b
+define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
+ %ta = trunc <8 x i16> %a to <8 x i8>
+ %tb = trunc <8 x i16> %b to <8 x i8>
+ %res = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+ ret <8 x i8> %res
+}
+
+declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>)
+
>From 24287f7f08d3bd238761b6e798aee655af931c3d Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Fri, 8 Aug 2025 00:04:32 +0800
Subject: [PATCH 02/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes
if they have sufficient leading zero/sign bits -2
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 16 +---
llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 96 +++++++++++++++----
2 files changed, 79 insertions(+), 33 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9ff256f8090ba..0cba06215d3fe 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16300,37 +16300,28 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
TLI.isOperationLegal(N0.getOpcode(), VT)) {
SDValue X = N0.getOperand(0);
SDValue Y = N0.getOperand(1);
-
- KnownBits KnownX = DAG.computeKnownBits(X);
- KnownBits KnownY = DAG.computeKnownBits(Y);
-
unsigned SrcBits = X.getScalarValueSizeInBits();
unsigned DstBits = VT.getScalarSizeInBits();
- unsigned NeededLeadingZeros = SrcBits - DstBits + 1;
-
- if (KnownX.countMinLeadingZeros() >= NeededLeadingZeros &&
- KnownY.countMinLeadingZeros() >= NeededLeadingZeros) {
+ unsigned MaxBitsX = DAG.ComputeMaxSignificantBits(X);
+ unsigned MaxBitsY = DAG.ComputeMaxSignificantBits(Y);
+ if (MaxBitsX <= DstBits && MaxBitsY <= DstBits) {
SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
}
}
break;
-
case ISD::AVGCEILS:
case ISD::AVGFLOORS:
if (!LegalOperations && N0.hasOneUse() &&
TLI.isOperationLegal(N0.getOpcode(), VT)) {
SDValue X = N0.getOperand(0);
SDValue Y = N0.getOperand(1);
-
unsigned SignBitsX = DAG.ComputeNumSignBits(X);
unsigned SignBitsY = DAG.ComputeNumSignBits(Y);
-
unsigned SrcBits = X.getScalarValueSizeInBits();
unsigned DstBits = VT.getScalarSizeInBits();
unsigned NeededSignBits = SrcBits - DstBits + 1;
-
if (SignBitsX >= NeededSignBits && SignBitsY >= NeededSignBits) {
SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
@@ -16338,7 +16329,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
}
}
break;
-
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
index 175f54d6f9c05..db40746776d43 100644
--- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -1,38 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s
-; CHECK-LABEL: test_avgceil_u
-; CHECK: uhadd v0.8b, v0.8b, v1.8b
+
define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
- %ta = trunc <8 x i16> %a to <8 x i8>
- %tb = trunc <8 x i16> %b to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+; CHECK-LABEL: test_avgceil_u:
+; CHECK: // %bb.0:
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: xtn v1.8b, v1.8h
+; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+
+ %mask = insertelement <8 x i16> undef, i16 255, i32 0
+ %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> undef, <8 x i32> zeroinitializer
+ %ta16 = and <8 x i16> %a, %mask.splat
+ %tb16 = and <8 x i16> %b, %mask.splat
+ %ta8 = trunc <8 x i16> %ta16 to <8 x i8>
+ %tb8 = trunc <8 x i16> %tb16 to <8 x i8>
+ %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
ret <8 x i8> %res
}
-; CHECK-LABEL: test_avgceil_s
-; CHECK: shadd v0.8b, v0.8b, v1.8b
+
define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
- %ta = trunc <8 x i16> %a to <8 x i8>
- %tb = trunc <8 x i16> %b to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+; CHECK-LABEL: test_avgceil_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqxtn v0.8b, v0.8h
+; CHECK-NEXT: sqxtn v1.8b, v1.8h
+; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+
+ %smin = insertelement <8 x i16> undef, i16 -128, i32 0
+ %smax = insertelement <8 x i16> undef, i16 127, i32 0
+ %min = shufflevector <8 x i16> %smin, <8 x i16> undef, <8 x i32> zeroinitializer
+ %max = shufflevector <8 x i16> %smax, <8 x i16> undef, <8 x i32> zeroinitializer
+
+ %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max)
+ %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min)
+ %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max)
+ %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min)
+
+ %ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8>
+ %tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8>
+ %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
ret <8 x i8> %res
}
-; CHECK-LABEL: test_avgfloor_u
-; CHECK: urhadd v0.8b, v0.8b, v1.8b
+
define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) {
- %ta = trunc <8 x i16> %a to <8 x i8>
- %tb = trunc <8 x i16> %b to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+; CHECK-LABEL: test_avgfloor_u:
+; CHECK: // %bb.0:
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: xtn v1.8b, v1.8h
+; CHECK-NEXT: urhadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+
+ %mask = insertelement <8 x i16> undef, i16 255, i32 0
+ %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> undef, <8 x i32> zeroinitializer
+ %ta16 = and <8 x i16> %a, %mask.splat
+ %tb16 = and <8 x i16> %b, %mask.splat
+ %ta8 = trunc <8 x i16> %ta16 to <8 x i8>
+ %tb8 = trunc <8 x i16> %tb16 to <8 x i8>
+ %res = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
ret <8 x i8> %res
}
-; CHECK-LABEL: test_avgfloor_s
-; CHECK: srhadd v0.8b, v0.8b, v1.8b
+
define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
- %ta = trunc <8 x i16> %a to <8 x i8>
- %tb = trunc <8 x i16> %b to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+; CHECK-LABEL: test_avgfloor_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqxtn v0.8b, v0.8h
+; CHECK-NEXT: sqxtn v1.8b, v1.8h
+; CHECK-NEXT: srhadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+
+ %smin = insertelement <8 x i16> undef, i16 -128, i32 0
+ %smax = insertelement <8 x i16> undef, i16 127, i32 0
+ %min = shufflevector <8 x i16> %smin, <8 x i16> undef, <8 x i32> zeroinitializer
+ %max = shufflevector <8 x i16> %smax, <8 x i16> undef, <8 x i32> zeroinitializer
+
+ %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max)
+ %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min)
+ %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max)
+ %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min)
+
+ %ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8>
+ %tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8>
+ %res = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
ret <8 x i8> %res
}
@@ -41,3 +94,6 @@ declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>)
declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>)
+
>From c8cc2a98030154d6a95154d8fe8d7461cfb0daf4 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Fri, 8 Aug 2025 10:51:33 +0800
Subject: [PATCH 03/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes
if they have sufficient leading zero/sign bits -3
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8 +++++---
llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 20 +++++++++----------
2 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0cba06215d3fe..7aea288c03208 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16302,9 +16302,10 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
SDValue Y = N0.getOperand(1);
unsigned SrcBits = X.getScalarValueSizeInBits();
unsigned DstBits = VT.getScalarSizeInBits();
- unsigned MaxBitsX = DAG.ComputeMaxSignificantBits(X);
- unsigned MaxBitsY = DAG.ComputeMaxSignificantBits(Y);
- if (MaxBitsX <= DstBits && MaxBitsY <= DstBits) {
+ KnownBits KnownX = DAG.computeKnownBits(X);
+ KnownBits KnownY = DAG.computeKnownBits(Y);
+ if (KnownX.countMinLeadingZeros() >= (SrcBits - DstBits) &&
+ KnownY.countMinLeadingZeros() >= (SrcBits - DstBits)) {
SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
@@ -16322,6 +16323,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
unsigned SrcBits = X.getScalarValueSizeInBits();
unsigned DstBits = VT.getScalarSizeInBits();
unsigned NeededSignBits = SrcBits - DstBits + 1;
+
if (SignBitsX >= NeededSignBits && SignBitsY >= NeededSignBits) {
SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
index db40746776d43..ede39e237a9c9 100644
--- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -10,8 +10,8 @@ define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
- %mask = insertelement <8 x i16> undef, i16 255, i32 0
- %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> undef, <8 x i32> zeroinitializer
+ %mask = insertelement <8 x i16> poison, i16 255, i32 0
+ %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = and <8 x i16> %a, %mask.splat
%tb16 = and <8 x i16> %b, %mask.splat
%ta8 = trunc <8 x i16> %ta16 to <8 x i8>
@@ -29,10 +29,10 @@ define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
- %smin = insertelement <8 x i16> undef, i16 -128, i32 0
- %smax = insertelement <8 x i16> undef, i16 127, i32 0
- %min = shufflevector <8 x i16> %smin, <8 x i16> undef, <8 x i32> zeroinitializer
- %max = shufflevector <8 x i16> %smax, <8 x i16> undef, <8 x i32> zeroinitializer
+ %smin = insertelement <8 x i16> poison, i16 -128, i32 0
+ %smax = insertelement <8 x i16> poison, i16 127, i32 0
+ %min = shufflevector <8 x i16> %smin, <8 x i16> poison, <8 x i32> zeroinitializer
+ %max = shufflevector <8 x i16> %smax, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max)
%ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min)
@@ -73,10 +73,10 @@ define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-NEXT: srhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
- %smin = insertelement <8 x i16> undef, i16 -128, i32 0
- %smax = insertelement <8 x i16> undef, i16 127, i32 0
- %min = shufflevector <8 x i16> %smin, <8 x i16> undef, <8 x i32> zeroinitializer
- %max = shufflevector <8 x i16> %smax, <8 x i16> undef, <8 x i32> zeroinitializer
+ %smin = insertelement <8 x i16> poison, i16 -128, i32 0
+ %smax = insertelement <8 x i16> poison, i16 127, i32 0
+ %min = shufflevector <8 x i16> %smin, <8 x i16> poison, <8 x i32> zeroinitializer
+ %max = shufflevector <8 x i16> %smax, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max)
%ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min)
>From 11152562f1255a4fcd60404d1e08ca80bf422090 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Fri, 8 Aug 2025 11:40:46 +0800
Subject: [PATCH 04/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes
if they have sufficient leading zero/sign bits-4
---
llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 59 ++++++++-------------
1 file changed, 22 insertions(+), 37 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
index ede39e237a9c9..4d4e828a751bd 100644
--- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -1,7 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s
-
define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgceil_u:
; CHECK: // %bb.0:
@@ -9,7 +8,6 @@ define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
; CHECK-NEXT: xtn v1.8b, v1.8h
; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
-
%mask = insertelement <8 x i16> poison, i16 255, i32 0
%mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = and <8 x i16> %a, %mask.splat
@@ -20,7 +18,6 @@ define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
ret <8 x i8> %res
}
-
define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgceil_s:
; CHECK: // %bb.0:
@@ -28,72 +25,60 @@ define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-NEXT: sqxtn v1.8b, v1.8h
; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
-
- %smin = insertelement <8 x i16> poison, i16 -128, i32 0
- %smax = insertelement <8 x i16> poison, i16 127, i32 0
- %min = shufflevector <8 x i16> %smin, <8 x i16> poison, <8 x i32> zeroinitializer
- %max = shufflevector <8 x i16> %smax, <8 x i16> poison, <8 x i32> zeroinitializer
-
- %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max)
- %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min)
- %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max)
- %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min)
-
+ %min = insertelement <8 x i16> poison, i16 -128, i32 0
+ %min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer
+ %max = insertelement <8 x i16> poison, i16 127, i32 0
+ %max.splat = shufflevector <8 x i16> %max, <8 x i16> poison, <8 x i32> zeroinitializer
+ %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max.splat)
+ %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat)
+ %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat)
+ %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat)
%ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8>
%tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8>
%res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
ret <8 x i8> %res
}
-
define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgfloor_u:
; CHECK: // %bb.0:
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: urhadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
-
- %mask = insertelement <8 x i16> undef, i16 255, i32 0
- %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> undef, <8 x i32> zeroinitializer
+ %mask = insertelement <8 x i16> poison, i16 255, i32 0
+ %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = and <8 x i16> %a, %mask.splat
%tb16 = and <8 x i16> %b, %mask.splat
%ta8 = trunc <8 x i16> %ta16 to <8 x i8>
%tb8 = trunc <8 x i16> %tb16 to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
+ %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
ret <8 x i8> %res
}
-
define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgfloor_s:
; CHECK: // %bb.0:
; CHECK-NEXT: sqxtn v0.8b, v0.8h
; CHECK-NEXT: sqxtn v1.8b, v1.8h
-; CHECK-NEXT: srhadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
-
- %smin = insertelement <8 x i16> poison, i16 -128, i32 0
- %smax = insertelement <8 x i16> poison, i16 127, i32 0
- %min = shufflevector <8 x i16> %smin, <8 x i16> poison, <8 x i32> zeroinitializer
- %max = shufflevector <8 x i16> %smax, <8 x i16> poison, <8 x i32> zeroinitializer
-
- %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max)
- %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min)
- %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max)
- %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min)
-
+ %min = insertelement <8 x i16> poison, i16 -128, i32 0
+ %min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer
+ %max = insertelement <8 x i16> poison, i16 127, i32 0
+ %max.splat = shufflevector <8 x i16> %max, <8 x i16> poison, <8 x i32> zeroinitializer
+ %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max.splat)
+ %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat)
+ %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat)
+ %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat)
%ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8>
%tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
+ %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
ret <8 x i8> %res
}
declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>)
declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>)
-
declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>)
>From 08138a2fde9896a580d11a2b4249eea86d42fefe Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Fri, 8 Aug 2025 12:55:44 +0800
Subject: [PATCH 05/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes
if they have sufficient leading zero/sign bits-5
---
llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 52 ++++++++++++---------
1 file changed, 30 insertions(+), 22 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
index 4d4e828a751bd..36fddedd78df6 100644
--- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -4,26 +4,31 @@
define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgceil_u:
; CHECK: // %bb.0:
+; CHECK-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: xtn v0.8b, v0.8h
-; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%mask = insertelement <8 x i16> poison, i16 255, i32 0
%mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = and <8 x i16> %a, %mask.splat
%tb16 = and <8 x i16> %b, %mask.splat
- %ta8 = trunc <8 x i16> %ta16 to <8 x i8>
- %tb8 = trunc <8 x i16> %tb16 to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
+ %avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16)
+ %res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
}
define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgceil_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: sqxtn v0.8b, v0.8h
-; CHECK-NEXT: sqxtn v1.8b, v1.8h
-; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: movi v2.8h, #127
+; CHECK-NEXT: mvni v3.8h, #127
+; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h
+; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h
+; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h
+; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%min = insertelement <8 x i16> poison, i16 -128, i32 0
%min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -33,35 +38,39 @@ define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
%ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat)
%tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat)
%tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat)
- %ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8>
- %tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
+ %avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped)
+ %res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
}
define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgfloor_u:
; CHECK: // %bb.0:
+; CHECK-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: xtn v0.8b, v0.8h
-; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%mask = insertelement <8 x i16> poison, i16 255, i32 0
%mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = and <8 x i16> %a, %mask.splat
%tb16 = and <8 x i16> %b, %mask.splat
- %ta8 = trunc <8 x i16> %ta16 to <8 x i8>
- %tb8 = trunc <8 x i16> %tb16 to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
+ %avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16)
+ %res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
}
define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgfloor_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: sqxtn v0.8b, v0.8h
-; CHECK-NEXT: sqxtn v1.8b, v1.8h
-; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: movi v2.8h, #127
+; CHECK-NEXT: mvni v3.8h, #127
+; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h
+; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h
+; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h
+; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%min = insertelement <8 x i16> poison, i16 -128, i32 0
%min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -71,9 +80,8 @@ define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
%ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat)
%tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat)
%tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat)
- %ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8>
- %tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
+ %avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped)
+ %res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
}
>From 728b37db85a9821aec9931af00a8338ae9d7c95e Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Fri, 8 Aug 2025 13:05:41 +0800
Subject: [PATCH 06/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes
if they have sufficient leading zero/sign bits-6
---
llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
index 36fddedd78df6..24a1e6f60c078 100644
--- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -85,8 +85,8 @@ define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
ret <8 x i8> %res
}
-declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>)
>From 44609a3b749675b758f1030b9401497192491dd4 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Sat, 9 Aug 2025 21:13:30 +0800
Subject: [PATCH 07/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes
if they have sufficient leading zero/sign bits-7
---
llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 36 +++++++--------------
1 file changed, 12 insertions(+), 24 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
index 24a1e6f60c078..ca8e713cafc13 100644
--- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -9,10 +9,8 @@ define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
- %mask = insertelement <8 x i16> poison, i16 255, i32 0
- %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
- %ta16 = and <8 x i16> %a, %mask.splat
- %tb16 = and <8 x i16> %b, %mask.splat
+ %ta16 = and <8 x i16> %a, splat (i16 255)
+ %tb16 = and <8 x i16> %b, splat (i16 255)
%avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16)
%res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
@@ -30,14 +28,10 @@ define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
- %min = insertelement <8 x i16> poison, i16 -128, i32 0
- %min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer
- %max = insertelement <8 x i16> poison, i16 127, i32 0
- %max.splat = shufflevector <8 x i16> %max, <8 x i16> poison, <8 x i32> zeroinitializer
- %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max.splat)
- %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat)
- %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat)
- %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat)
+ %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> splat (i16 127))
+ %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> splat (i16 -128))
+ %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> splat (i16 127))
+ %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> splat (i16 -128))
%avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped)
%res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
@@ -51,10 +45,8 @@ define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) {
; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
- %mask = insertelement <8 x i16> poison, i16 255, i32 0
- %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
- %ta16 = and <8 x i16> %a, %mask.splat
- %tb16 = and <8 x i16> %b, %mask.splat
+ %ta16 = and <8 x i16> %a, splat (i16 255)
+ %tb16 = and <8 x i16> %b, splat (i16 255)
%avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16)
%res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
@@ -72,14 +64,10 @@ define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
- %min = insertelement <8 x i16> poison, i16 -128, i32 0
- %min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer
- %max = insertelement <8 x i16> poison, i16 127, i32 0
- %max.splat = shufflevector <8 x i16> %max, <8 x i16> poison, <8 x i32> zeroinitializer
- %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max.splat)
- %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat)
- %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat)
- %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat)
+ %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> splat (i16 127))
+ %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> splat (i16 -128))
+ %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> splat (i16 127))
+ %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> splat (i16 -128))
%avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped)
%res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
>From 2d268fc6bd5de28d1dd6adbabc732e475a530014 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Sun, 17 Aug 2025 00:09:15 +0800
Subject: [PATCH 08/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes
if they have sufficient leading zero/sign bits-8
---
llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 81 +++++++--------------
1 file changed, 27 insertions(+), 54 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
index ca8e713cafc13..8d9ea6c9d9922 100644
--- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -1,80 +1,53 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s
-define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_avgceil_u:
+define <8 x i8> @avgceil_u_i8_to_i16(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: avgceil_u_i8_to_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: bic v0.8h, #255, lsl #8
-; CHECK-NEXT: bic v1.8h, #255, lsl #8
-; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: urhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
- %ta16 = and <8 x i16> %a, splat (i16 255)
- %tb16 = and <8 x i16> %b, splat (i16 255)
- %avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16)
- %res = trunc <8 x i16> %avg16 to <8 x i8>
- ret <8 x i8> %res
+ %a16 = zext <8 x i8> %a to <8 x i16>
+ %b16 = zext <8 x i8> %b to <8 x i16>
+ %avg16 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %a16, <8 x i16> %b16)
+ %r = trunc <8 x i16> %avg16 to <8 x i8>
+ ret <8 x i8> %r
}
-define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
+
+define <8 x i8> @test_avgceil_s(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_avgceil_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.8h, #127
-; CHECK-NEXT: mvni v3.8h, #127
-; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h
-; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h
-; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h
-; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: srhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
- %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> splat (i16 127))
- %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> splat (i16 -128))
- %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> splat (i16 127))
- %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> splat (i16 -128))
- %avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped)
- %res = trunc <8 x i16> %avg16 to <8 x i8>
+ %a16 = sext <8 x i8> %a to <8 x i16>
+ %b16 = sext <8 x i8> %b to <8 x i16>
+ %avg16 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %a16, <8 x i16> %b16)
+ %res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
}
-define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_avgfloor_u:
+define <8 x i8> @avgfloor_u_from_intrin(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: avgfloor_u_from_intrin:
; CHECK: // %bb.0:
-; CHECK-NEXT: bic v0.8h, #255, lsl #8
-; CHECK-NEXT: bic v1.8h, #255, lsl #8
-; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
- %ta16 = and <8 x i16> %a, splat (i16 255)
- %tb16 = and <8 x i16> %b, splat (i16 255)
- %avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16)
+ %a16 = zext <8 x i8> %a to <8 x i16>
+ %b16 = zext <8 x i8> %b to <8 x i16>
+ %avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %a16, <8 x i16> %b16)
%res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
}
-define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
+define <8 x i8> @test_avgfloor_s(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_avgfloor_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.8h, #127
-; CHECK-NEXT: mvni v3.8h, #127
-; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h
-; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h
-; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h
-; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
- %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> splat (i16 127))
- %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> splat (i16 -128))
- %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> splat (i16 127))
- %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> splat (i16 -128))
- %avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped)
- %res = trunc <8 x i16> %avg16 to <8 x i8>
+ %a16 = sext <8 x i8> %a to <8 x i16>
+ %b16 = sext <8 x i8> %b to <8 x i16>
+ %avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %a16, <8 x i16> %b16)
+ %res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
}
-declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>)
>From 32041fbb0b9696b8ab59feab66354aad96e4b1f7 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Sun, 17 Aug 2025 00:10:04 +0800
Subject: [PATCH 09/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes
if they have sufficient leading zero/sign bits-9
---
llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
index 8d9ea6c9d9922..030e9ea994264 100644
--- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -26,7 +26,7 @@ define <8 x i8> @test_avgceil_s(<8 x i8> %a, <8 x i8> %b) {
ret <8 x i8> %res
}
-define <8 x i8> @avgfloor_u_from_intrin(<8 x i8> %a, <8 x i8> %b) {
+define <8 x i8> @avgfloor_u_i8_to_i16(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: avgfloor_u_from_intrin:
; CHECK: // %bb.0:
; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
>From 4e1af14d3efaed8c47448a158f547bdcd47879b3 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Sun, 17 Aug 2025 23:32:43 +0800
Subject: [PATCH 10/15] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if
they have sufficient leading zero/sign bits-10
---
llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
index 030e9ea994264..54fcae4ba28b7 100644
--- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -27,7 +27,7 @@ define <8 x i8> @test_avgceil_s(<8 x i8> %a, <8 x i8> %b) {
}
define <8 x i8> @avgfloor_u_i8_to_i16(<8 x i8> %a, <8 x i8> %b) {
-; CHECK-LABEL: avgfloor_u_from_intrin:
+; CHECK-LABEL: avgfloor_u_i8_to_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
>From c4ea7bdf7df0749e30479967d7643b363df43bf7 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Mon, 18 Aug 2025 21:05:31 +0800
Subject: [PATCH 11/15] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes
if they have sufficient leading zero/sign bits-11
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 13 +++++--------
1 file changed, 5 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7aea288c03208..738aa96b729ec 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16302,10 +16302,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
SDValue Y = N0.getOperand(1);
unsigned SrcBits = X.getScalarValueSizeInBits();
unsigned DstBits = VT.getScalarSizeInBits();
- KnownBits KnownX = DAG.computeKnownBits(X);
- KnownBits KnownY = DAG.computeKnownBits(Y);
- if (KnownX.countMinLeadingZeros() >= (SrcBits - DstBits) &&
- KnownY.countMinLeadingZeros() >= (SrcBits - DstBits)) {
+ APInt UpperBits = APInt::getBitsSetFrom(SrcBits, DstBits);
+ if (DAG.MaskedValueIsZero(X, UpperBits) &&
+ DAG.MaskedValueIsZero(Y, UpperBits)) {
SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
@@ -16318,13 +16317,11 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
TLI.isOperationLegal(N0.getOpcode(), VT)) {
SDValue X = N0.getOperand(0);
SDValue Y = N0.getOperand(1);
- unsigned SignBitsX = DAG.ComputeNumSignBits(X);
- unsigned SignBitsY = DAG.ComputeNumSignBits(Y);
unsigned SrcBits = X.getScalarValueSizeInBits();
unsigned DstBits = VT.getScalarSizeInBits();
unsigned NeededSignBits = SrcBits - DstBits + 1;
-
- if (SignBitsX >= NeededSignBits && SignBitsY >= NeededSignBits) {
+ if (DAG.ComputeNumSignBits(X) >= NeededSignBits &&
+ DAG.ComputeNumSignBits(Y) >= NeededSignBits) {
SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
>From fac54fffd2fc76a4523bb26008e2e2b5a37c0a16 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Wed, 17 Sep 2025 10:23:43 +0800
Subject: [PATCH 12/15] [X86] X86TargetLowering::computeKnownBitsForTargetNode
- add X86ISD::VPMADD52L/H handling-1
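VPMADD52L/H multiply the low 52 bits of the two source operands in each 64-bit lane and add the low (L) or high (H) 52 bits of the 104-bit product to the accumulator operand. The new handling models this by truncating the known bits of both multiplicands to 52 bits, taking KnownBits::mul (L) or KnownBits::mulhu (H), zero-extending back to 64 bits and adding the accumulator's known bits. A worked example in the spirit of the new test (illustrative only; the function and value names are not part of the patch): with both multiplicands masked to 25 bits the product fits in 50 bits, so the high 52 bits of the product are all zero, the H form just passes the accumulator through, and the final 'and' can constant-fold (the test checks a broadcast of 1).

define <2 x i64> @sketch_vpmadd52h_knownbits(<2 x i64> %x, <2 x i64> %y) {
  %mx = and <2 x i64> %x, splat (i64 33554431)   ; (1<<25)-1
  %my = and <2 x i64> %y, splat (i64 33554431)   ; (1<<25)-1
  ; the high 52 bits of a <= 50-bit product are zero, so %r is known to equal
  ; the accumulator <1, 1> and %bit0 is known to be <1, 1>
  %r = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> splat (i64 1), <2 x i64> %mx, <2 x i64> %my)
  %bit0 = and <2 x i64> %r, splat (i64 1)
  ret <2 x i64> %bit0
}
declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)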
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 20 +++
llvm/test/CodeGen/X86/knownbits-vpmadd52.ll | 138 ++++++++++++++++++++
2 files changed, 158 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/knownbits-vpmadd52.ll
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f81efdc6414aa..b345a57d46863 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -38999,6 +38999,26 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
}
break;
}
+ case X86ISD::VPMADD52L:
+ case X86ISD::VPMADD52H: {
+ assert(Op.getValueType().isVector() &&
+ Op.getValueType().getScalarType() == MVT::i64 &&
+ "Unexpected VPMADD52 type");
+ KnownBits K0 =
+ DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ KnownBits K1 =
+ DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ KnownBits KAcc =
+ DAG.computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1);
+ K0 = K0.trunc(52);
+ K1 = K1.trunc(52);
+ KnownBits KnownMul = (Op.getOpcode() == X86ISD::VPMADD52L)
+ ? KnownBits::mul(K0, K1)
+ : KnownBits::mulhu(K0, K1);
+ KnownMul = KnownMul.zext(64);
+ Known = KnownBits::add(KAcc, KnownMul);
+ return;
+ }
}
// Handle target shuffles.
diff --git a/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll b/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll
new file mode 100644
index 0000000000000..0b5be5fc9900b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
+
+
+
+; H path: take the high 52 bits of the product and add them to the accumulator
+; 25-bit = (1<<25)-1 = 33554431
+; 26-bit = (1<<26)-1 = 67108863
+
+declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
+declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
+
+define <2 x i64> @kb52h_128_mask25_and1(<2 x i64> %x, <2 x i64> %y) {
+; AVX512VL-LABEL: kb52h_128_mask25_and1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovddup {{.*#+}} xmm0 = [1,1]
+; AVX512VL-NEXT: # xmm0 = mem[0,0]
+; AVX512VL-NEXT: retq
+ %mx = and <2 x i64> %x, <i64 33554431, i64 33554431>
+ %my = and <2 x i64> %y, <i64 33554431, i64 33554431>
+ %r = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(
+ <2 x i64> <i64 1, i64 1>, ; acc
+ <2 x i64> %mx, ; x (masked to 25-bit)
+ <2 x i64> %my) ; y (masked to 25-bit)
+ %ret = and <2 x i64> %r, <i64 1, i64 1>
+ ret <2 x i64> %ret
+}
+
+define <4 x i64> @kb52h_256_mask25x26_acc1(<4 x i64> %x, <4 x i64> %y) {
+; AVX512VL-LABEL: kb52h_256_mask25x26_acc1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,1,1,1]
+; AVX512VL-NEXT: retq
+ %mx = and <4 x i64> %x, <i64 33554431, i64 33554431, i64 33554431, i64 33554431>
+ %my = and <4 x i64> %y, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
+ %r = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(
+ <4 x i64> <i64 1, i64 1, i64 1, i64 1>,
+ <4 x i64> %mx,
+ <4 x i64> %my)
+ ret <4 x i64> %r
+}
+
+define <8 x i64> @kb52h_512_mask25_and1(<8 x i64> %x, <8 x i64> %y) {
+; AVX512VL-LABEL: kb52h_512_mask25_and1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcastsd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT: retq
+ %mx = and <8 x i64> %x, <i64 33554431, i64 33554431, i64 33554431, i64 33554431, i64 33554431, i64 33554431, i64 33554431, i64 33554431>
+ %my = and <8 x i64> %y, <i64 33554431, i64 33554431, i64 33554431, i64 33554431, i64 33554431, i64 33554431, i64 33554431, i64 33554431>
+ %r = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(
+ <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>,
+ <8 x i64> %mx,
+ <8 x i64> %my)
+ %ret = and <8 x i64> %r, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+ ret <8 x i64> %ret
+}
+
+
+; 26-bit = 67108863 = (1<<26)-1
+; 50-bit = 1125899906842623 = (1<<50)-1
+
+declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
+declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
+
+
+
+define <2 x i64> @kb52l_128_mask26x26_add_intrin(<2 x i64> %x, <2 x i64> %y, <2 x i64> %acc) {
+; AVX512VL-LABEL: kb52l_128_mask26x26_add_intrin:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [67108863,67108863]
+; AVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpmadd52luq %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0
+; AVX512VL-NEXT: retq
+ %xm = and <2 x i64> %x, <i64 67108863, i64 67108863>
+ %ym = and <2 x i64> %y, <i64 67108863, i64 67108863>
+ %r = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %acc, <2 x i64> %xm, <2 x i64> %ym)
+ ret <2 x i64> %r
+}
+
+
+
+define <4 x i64> @kb52l_256_mask50x3_add_intrin(<4 x i64> %x, <4 x i64> %y, <4 x i64> %acc) {
+; AVX512VL-LABEL: kb52l_256_mask50x3_add_intrin:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmadd52luq %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512VL-NEXT: retq
+ %xm = and <4 x i64> %x, <i64 1125899906842623, i64 1125899906842623, i64 1125899906842623, i64 1125899906842623>
+ %ym = and <4 x i64> %y, <i64 3, i64 3, i64 3, i64 3>
+ %r = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %acc, <4 x i64> %xm, <4 x i64> %ym)
+ ret <4 x i64> %r
+}
+
+
+
+define <8 x i64> @kb52l_512_mask26x26_add_intrin(<8 x i64> %x, <8 x i64> %y, <8 x i64> %acc) {
+; AVX512-NOVL-LABEL: kb52l_512_mask26x26_add_intrin:
+; AVX512-NOVL: vpmadd52luq
+; AVX512-NOVL: retq
+; AVX512VL-LABEL: kb52l_512_mask26x26_add_intrin:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
+; AVX512VL-NEXT: vpandq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; AVX512VL-NEXT: vpmadd52luq %zmm1, %zmm0, %zmm2
+; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512VL-NEXT: retq
+ %xm = and <8 x i64> %x, <i64 67108863, i64 67108863, i64 67108863, i64 67108863, i64 67108863, i64 67108863, i64 67108863, i64 67108863>
+ %ym = and <8 x i64> %y, <i64 67108863, i64 67108863, i64 67108863, i64 67108863, i64 67108863, i64 67108863, i64 67108863, i64 67108863>
+ %r = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %acc, <8 x i64> %xm, <8 x i64> %ym)
+ ret <8 x i64> %r
+}
+
+
+
+
+define <2 x i64> @kb52l_128_neg_27x27_plain(<2 x i64> %x, <2 x i64> %y, <2 x i64> %acc) {
+; AVX512VL-LABEL: kb52l_128_neg_27x27_plain:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [67108864,67108864]
+; AVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+ %xm = and <2 x i64> %x, <i64 67108864, i64 67108864> ; 1<<26
+ %ym = and <2 x i64> %y, <i64 67108864, i64 67108864>
+ %mul = mul <2 x i64> %xm, %ym
+ %res = add <2 x i64> %mul, %acc
+ ret <2 x i64> %res
+}
+
>From c5100dcee32919cd250088ece985123e6bf231ab Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Wed, 17 Sep 2025 10:39:09 +0800
Subject: [PATCH 13/15] Remove unintended changes to DAGCombiner.cpp
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 34 -------------------
1 file changed, 34 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8f4e84a34a8bd..4b20b756f8a15 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16354,40 +16354,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
// because targets may prefer a wider type during later combines and invert
// this transform.
switch (N0.getOpcode()) {
- case ISD::AVGCEILU:
- case ISD::AVGFLOORU:
- if (!LegalOperations && N0.hasOneUse() &&
- TLI.isOperationLegal(N0.getOpcode(), VT)) {
- SDValue X = N0.getOperand(0);
- SDValue Y = N0.getOperand(1);
- unsigned SrcBits = X.getScalarValueSizeInBits();
- unsigned DstBits = VT.getScalarSizeInBits();
- APInt UpperBits = APInt::getBitsSetFrom(SrcBits, DstBits);
- if (DAG.MaskedValueIsZero(X, UpperBits) &&
- DAG.MaskedValueIsZero(Y, UpperBits)) {
- SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
- SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
- return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
- }
- }
- break;
- case ISD::AVGCEILS:
- case ISD::AVGFLOORS:
- if (!LegalOperations && N0.hasOneUse() &&
- TLI.isOperationLegal(N0.getOpcode(), VT)) {
- SDValue X = N0.getOperand(0);
- SDValue Y = N0.getOperand(1);
- unsigned SrcBits = X.getScalarValueSizeInBits();
- unsigned DstBits = VT.getScalarSizeInBits();
- unsigned NeededSignBits = SrcBits - DstBits + 1;
- if (DAG.ComputeNumSignBits(X) >= NeededSignBits &&
- DAG.ComputeNumSignBits(Y) >= NeededSignBits) {
- SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
- SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
- return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
- }
- }
- break;
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
>From 27f0f4295c972e3b5611f13352c79d24c04a8bcf Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Fri, 19 Sep 2025 00:19:05 +0800
Subject: [PATCH 14/15] update test case
---
llvm/test/CodeGen/X86/knownbits-vpmadd52.ll | 137 ++++++++------------
1 file changed, 52 insertions(+), 85 deletions(-)
diff --git a/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll b/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll
index 0b5be5fc9900b..b3f7fe205a958 100644
--- a/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll
+++ b/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll
@@ -1,15 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxifma | FileCheck %s --check-prefixes=AVXIFMA
-
-; H path: take the high 52 bits of the product and add them to the accumulator
-; 25-bit = (1<<25)-1 = 33554431
-; 26-bit = (1<<26)-1 = 67108863
+; High-52 path
declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
-declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
define <2 x i64> @kb52h_128_mask25_and1(<2 x i64> %x, <2 x i64> %y) {
; AVX512VL-LABEL: kb52h_128_mask25_and1:
@@ -17,13 +14,19 @@ define <2 x i64> @kb52h_128_mask25_and1(<2 x i64> %x, <2 x i64> %y) {
; AVX512VL-NEXT: vmovddup {{.*#+}} xmm0 = [1,1]
; AVX512VL-NEXT: # xmm0 = mem[0,0]
; AVX512VL-NEXT: retq
- %mx = and <2 x i64> %x, <i64 33554431, i64 33554431>
- %my = and <2 x i64> %y, <i64 33554431, i64 33554431>
+;
+; AVXIFMA-LABEL: kb52h_128_mask25_and1:
+; AVXIFMA: # %bb.0:
+; AVXIFMA-NEXT: vmovddup {{.*#+}} xmm0 = [1,1]
+; AVXIFMA-NEXT: # xmm0 = mem[0,0]
+; AVXIFMA-NEXT: retq
+ %mx = and <2 x i64> %x, splat (i64 33554431) ; (1<<25)-1
+ %my = and <2 x i64> %y, splat (i64 33554431) ; (1<<25)-1
%r = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(
- <2 x i64> <i64 1, i64 1>, ; acc
- <2 x i64> %mx, ; x (masked to 25-bit)
- <2 x i64> %my) ; y (masked to 25-bit)
- %ret = and <2 x i64> %r, <i64 1, i64 1>
+ <2 x i64> splat (i64 1),
+ <2 x i64> %mx,
+ <2 x i64> %my)
+ %ret = and <2 x i64> %r, splat (i64 1)
ret <2 x i64> %ret
}
@@ -32,39 +35,23 @@ define <4 x i64> @kb52h_256_mask25x26_acc1(<4 x i64> %x, <4 x i64> %y) {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,1,1,1]
; AVX512VL-NEXT: retq
- %mx = and <4 x i64> %x, <i64 33554431, i64 33554431, i64 33554431, i64 33554431>
- %my = and <4 x i64> %y, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
+;
+; AVXIFMA-LABEL: kb52h_256_mask25x26_acc1:
+; AVXIFMA: # %bb.0:
+; AVXIFMA-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,1,1,1]
+; AVXIFMA-NEXT: retq
+ %mx = and <4 x i64> %x, splat (i64 33554431) ; (1<<25)-1
+ %my = and <4 x i64> %y, splat (i64 67108863) ; (1<<26)-1
%r = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(
- <4 x i64> <i64 1, i64 1, i64 1, i64 1>,
- <4 x i64> %mx,
- <4 x i64> %my)
+ <4 x i64> splat (i64 1),
+ <4 x i64> %mx, <4 x i64> %my)
ret <4 x i64> %r
}
-define <8 x i64> @kb52h_512_mask25_and1(<8 x i64> %x, <8 x i64> %y) {
-; AVX512VL-LABEL: kb52h_512_mask25_and1:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vbroadcastsd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
-; AVX512VL-NEXT: retq
- %mx = and <8 x i64> %x, <i64 33554431, i64 33554431, i64 33554431, i64 33554431, i64 33554431, i64 33554431, i64 33554431, i64 33554431>
- %my = and <8 x i64> %y, <i64 33554431, i64 33554431, i64 33554431, i64 33554431, i64 33554431, i64 33554431, i64 33554431, i64 33554431>
- %r = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(
- <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>,
- <8 x i64> %mx,
- <8 x i64> %my)
- %ret = and <8 x i64> %r, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
- ret <8 x i64> %ret
-}
-
-
-; 26-bit = 67108863 = (1<<26)-1
-; 50-bit = 1125899906842623 = (1<<50)-1
+; Low-52 path
declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
-declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
-
-
define <2 x i64> @kb52l_128_mask26x26_add_intrin(<2 x i64> %x, <2 x i64> %y, <2 x i64> %acc) {
; AVX512VL-LABEL: kb52l_128_mask26x26_add_intrin:
@@ -75,14 +62,22 @@ define <2 x i64> @kb52l_128_mask26x26_add_intrin(<2 x i64> %x, <2 x i64> %y, <2
; AVX512VL-NEXT: vpmadd52luq %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0
; AVX512VL-NEXT: retq
- %xm = and <2 x i64> %x, <i64 67108863, i64 67108863>
- %ym = and <2 x i64> %y, <i64 67108863, i64 67108863>
- %r = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %acc, <2 x i64> %xm, <2 x i64> %ym)
+;
+; AVXIFMA-LABEL: kb52l_128_mask26x26_add_intrin:
+; AVXIFMA: # %bb.0:
+; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} xmm3 = [67108863,67108863]
+; AVXIFMA-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVXIFMA-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVXIFMA-NEXT: {vex} vpmadd52luq %xmm1, %xmm0, %xmm2
+; AVXIFMA-NEXT: vmovdqa %xmm2, %xmm0
+; AVXIFMA-NEXT: retq
+ %xm = and <2 x i64> %x, splat (i64 67108863) ; (1<<26)-1
+ %ym = and <2 x i64> %y, splat (i64 67108863) ; (1<<26)-1
+ %r = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(
+ <2 x i64> %acc, <2 x i64> %xm, <2 x i64> %ym)
ret <2 x i64> %r
}
-
-
define <4 x i64> @kb52l_256_mask50x3_add_intrin(<4 x i64> %x, <4 x i64> %y, <4 x i64> %acc) {
; AVX512VL-LABEL: kb52l_256_mask50x3_add_intrin:
; AVX512VL: # %bb.0:
@@ -91,48 +86,20 @@ define <4 x i64> @kb52l_256_mask50x3_add_intrin(<4 x i64> %x, <4 x i64> %y, <4 x
; AVX512VL-NEXT: vpmadd52luq %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
- %xm = and <4 x i64> %x, <i64 1125899906842623, i64 1125899906842623, i64 1125899906842623, i64 1125899906842623>
- %ym = and <4 x i64> %y, <i64 3, i64 3, i64 3, i64 3>
- %r = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %acc, <4 x i64> %xm, <4 x i64> %ym)
+;
+; AVXIFMA-LABEL: kb52l_256_mask50x3_add_intrin:
+; AVXIFMA: # %bb.0:
+; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1125899906842623,1125899906842623,1125899906842623,1125899906842623]
+; AVXIFMA-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm3 = [3,3,3,3]
+; AVXIFMA-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm2
+; AVXIFMA-NEXT: vmovdqa %ymm2, %ymm0
+; AVXIFMA-NEXT: retq
+ %xm = and <4 x i64> %x, splat (i64 1125899906842623) ; (1<<50)-1
+ %ym = and <4 x i64> %y, splat (i64 3)
+ %r = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(
+ <4 x i64> %acc, <4 x i64> %xm, <4 x i64> %ym)
ret <4 x i64> %r
}
-
-
-define <8 x i64> @kb52l_512_mask26x26_add_intrin(<8 x i64> %x, <8 x i64> %y, <8 x i64> %acc) {
-; AVX512-NOVL-LABEL: kb52l_512_mask26x26_add_intrin:
-; AVX512-NOVL: vpmadd52luq
-; AVX512-NOVL: retq
-; AVX512VL-LABEL: kb52l_512_mask26x26_add_intrin:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
-; AVX512VL-NEXT: vpandq %zmm3, %zmm0, %zmm0
-; AVX512VL-NEXT: vpandq %zmm3, %zmm1, %zmm1
-; AVX512VL-NEXT: vpmadd52luq %zmm1, %zmm0, %zmm2
-; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0
-; AVX512VL-NEXT: retq
- %xm = and <8 x i64> %x, <i64 67108863, i64 67108863, i64 67108863, i64 67108863, i64 67108863, i64 67108863, i64 67108863, i64 67108863>
- %ym = and <8 x i64> %y, <i64 67108863, i64 67108863, i64 67108863, i64 67108863, i64 67108863, i64 67108863, i64 67108863, i64 67108863>
- %r = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %acc, <8 x i64> %xm, <8 x i64> %ym)
- ret <8 x i64> %r
-}
-
-
-
-
-define <2 x i64> @kb52l_128_neg_27x27_plain(<2 x i64> %x, <2 x i64> %y, <2 x i64> %acc) {
-; AVX512VL-LABEL: kb52l_128_neg_27x27_plain:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [67108864,67108864]
-; AVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
- %xm = and <2 x i64> %x, <i64 67108864, i64 67108864> ; 1<<26
- %ym = and <2 x i64> %y, <i64 67108864, i64 67108864>
- %mul = mul <2 x i64> %xm, %ym
- %res = add <2 x i64> %mul, %acc
- ret <2 x i64> %res
-}
-
>From efeb7402d3a899e2a420cdf8057408e331080834 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Mon, 22 Sep 2025 11:23:18 +0800
Subject: [PATCH 15/15] update test case: knownbits-vpmadd52.ll
---
llvm/test/CodeGen/X86/knownbits-vpmadd52.ll | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll b/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll
index b3f7fe205a958..0e322fec2c7d9 100644
--- a/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll
+++ b/llvm/test/CodeGen/X86/knownbits-vpmadd52.ll
@@ -8,6 +8,7 @@
declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
+; High-52, 25x25 masked inputs, accumulator = 1, expected constant fold.
define <2 x i64> @kb52h_128_mask25_and1(<2 x i64> %x, <2 x i64> %y) {
; AVX512VL-LABEL: kb52h_128_mask25_and1:
; AVX512VL: # %bb.0:
@@ -30,6 +31,7 @@ define <2 x i64> @kb52h_128_mask25_and1(<2 x i64> %x, <2 x i64> %y) {
ret <2 x i64> %ret
}
+; High-52, 25x26 masked inputs, accumulator = 1, expected constant fold.
define <4 x i64> @kb52h_256_mask25x26_acc1(<4 x i64> %x, <4 x i64> %y) {
; AVX512VL-LABEL: kb52h_256_mask25x26_acc1:
; AVX512VL: # %bb.0:
@@ -53,6 +55,7 @@ define <4 x i64> @kb52h_256_mask25x26_acc1(<4 x i64> %x, <4 x i64> %y) {
declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
+; Low-52, 26x26 masked inputs, add with accumulator.
define <2 x i64> @kb52l_128_mask26x26_add_intrin(<2 x i64> %x, <2 x i64> %y, <2 x i64> %acc) {
; AVX512VL-LABEL: kb52l_128_mask26x26_add_intrin:
; AVX512VL: # %bb.0:
@@ -78,6 +81,7 @@ define <2 x i64> @kb52l_128_mask26x26_add_intrin(<2 x i64> %x, <2 x i64> %y, <2
ret <2 x i64> %r
}
+; Low-52, 50-bit × 2-bit masked inputs, add with accumulator.
define <4 x i64> @kb52l_256_mask50x3_add_intrin(<4 x i64> %x, <4 x i64> %y, <4 x i64> %acc) {
; AVX512VL-LABEL: kb52l_256_mask50x3_add_intrin:
; AVX512VL: # %bb.0:
@@ -97,7 +101,7 @@ define <4 x i64> @kb52l_256_mask50x3_add_intrin(<4 x i64> %x, <4 x i64> %y, <4 x
; AVXIFMA-NEXT: vmovdqa %ymm2, %ymm0
; AVXIFMA-NEXT: retq
%xm = and <4 x i64> %x, splat (i64 1125899906842623) ; (1<<50)-1
- %ym = and <4 x i64> %y, splat (i64 3)
+ %ym = and <4 x i64> %y, splat (i64 3) ; (1<<2)-1
%r = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(
<4 x i64> %acc, <4 x i64> %xm, <4 x i64> %ym)
ret <4 x i64> %r