[llvm] [DAG] Fold trunc(avg(x, y)) for avgceil/floor u/s nodes if they have sufficient leading zero/sign bits (PR #152273)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 7 21:56:04 PDT 2025
https://github.com/houngkoungting updated https://github.com/llvm/llvm-project/pull/152273
From 80e303c6e0976d8c2437a806679a54d5919c5917 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Wed, 6 Aug 2025 16:17:48 +0800
Subject: [PATCH 1/5] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if
they have sufficient leading zero/sign bits-1
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 45 +++++++++++++++++++
llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 43 ++++++++++++++++++
2 files changed, 88 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d70e96938ed9a..9ff256f8090ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16294,6 +16294,51 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
// because targets may prefer a wider type during later combines and invert
// this transform.
switch (N0.getOpcode()) {
+ case ISD::AVGCEILU:
+ case ISD::AVGFLOORU:
+ if (!LegalOperations && N0.hasOneUse() &&
+ TLI.isOperationLegal(N0.getOpcode(), VT)) {
+ SDValue X = N0.getOperand(0);
+ SDValue Y = N0.getOperand(1);
+
+ KnownBits KnownX = DAG.computeKnownBits(X);
+ KnownBits KnownY = DAG.computeKnownBits(Y);
+
+ unsigned SrcBits = X.getScalarValueSizeInBits();
+ unsigned DstBits = VT.getScalarSizeInBits();
+ unsigned NeededLeadingZeros = SrcBits - DstBits + 1;
+
+ if (KnownX.countMinLeadingZeros() >= NeededLeadingZeros &&
+ KnownY.countMinLeadingZeros() >= NeededLeadingZeros) {
+ SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
+ SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
+ return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
+ }
+ }
+ break;
+
+ case ISD::AVGCEILS:
+ case ISD::AVGFLOORS:
+ if (!LegalOperations && N0.hasOneUse() &&
+ TLI.isOperationLegal(N0.getOpcode(), VT)) {
+ SDValue X = N0.getOperand(0);
+ SDValue Y = N0.getOperand(1);
+
+ unsigned SignBitsX = DAG.ComputeNumSignBits(X);
+ unsigned SignBitsY = DAG.ComputeNumSignBits(Y);
+
+ unsigned SrcBits = X.getScalarValueSizeInBits();
+ unsigned DstBits = VT.getScalarSizeInBits();
+ unsigned NeededSignBits = SrcBits - DstBits + 1;
+
+ if (SignBitsX >= NeededSignBits && SignBitsY >= NeededSignBits) {
+ SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
+ SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
+ return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
+ }
+ }
+ break;
+
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
new file mode 100644
index 0000000000000..175f54d6f9c05
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -0,0 +1,43 @@
+; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s
+
+; CHECK-LABEL: test_avgceil_u
+; CHECK: uhadd v0.8b, v0.8b, v1.8b
+define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
+ %ta = trunc <8 x i16> %a to <8 x i8>
+ %tb = trunc <8 x i16> %b to <8 x i8>
+ %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+ ret <8 x i8> %res
+}
+
+; CHECK-LABEL: test_avgceil_s
+; CHECK: shadd v0.8b, v0.8b, v1.8b
+define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
+ %ta = trunc <8 x i16> %a to <8 x i8>
+ %tb = trunc <8 x i16> %b to <8 x i8>
+ %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+ ret <8 x i8> %res
+}
+
+; CHECK-LABEL: test_avgfloor_u
+; CHECK: urhadd v0.8b, v0.8b, v1.8b
+define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) {
+ %ta = trunc <8 x i16> %a to <8 x i8>
+ %tb = trunc <8 x i16> %b to <8 x i8>
+ %res = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+ ret <8 x i8> %res
+}
+
+; CHECK-LABEL: test_avgfloor_s
+; CHECK: srhadd v0.8b, v0.8b, v1.8b
+define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
+ %ta = trunc <8 x i16> %a to <8 x i8>
+ %tb = trunc <8 x i16> %b to <8 x i8>
+ %res = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+ ret <8 x i8> %res
+}
+
+declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>)
+
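For the signed forms, the guard NeededSignBits = SrcBits - DstBits + 1 matches the ComputeNumSignBits convention: a value has at least SrcBits - DstBits + 1 redundant sign bits exactly when it is representable in DstBits signed bits, so both operands of the wide average already fit in the narrow type. A minimal standalone model of that check for the i16 -> i8 case (plain C++; numSignBits is a hypothetical stand-in for DAG.ComputeNumSignBits, which the real code queries):

  #include <cstdint>
  #include <cstdio>

  // Count how many leading bits match the sign bit of a 16-bit value,
  // mirroring the semantics of ComputeNumSignBits.
  static unsigned numSignBits(int16_t V) {
    unsigned N = 1;
    while (N < 16 && ((V >> (15 - N)) & 1) == ((V >> 15) & 1))
      ++N;
    return N;
  }

  int main() {
    const unsigned SrcBits = 16, DstBits = 8;
    const unsigned NeededSignBits = SrcBits - DstBits + 1; // 9
    int16_t X = 100, Y = -120; // both already fit in i8
    if (numSignBits(X) >= NeededSignBits &&
        numSignBits(Y) >= NeededSignBits)
      printf("safe to narrow the signed average to %u bits\n", DstBits);
    return 0;
  }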
From 24287f7f08d3bd238761b6e798aee655af931c3d Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Fri, 8 Aug 2025 00:04:32 +0800
Subject: [PATCH 2/5] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if
they have sufficient leading zero/sign bits -2
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 16 +---
llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 96 +++++++++++++++----
2 files changed, 79 insertions(+), 33 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9ff256f8090ba..0cba06215d3fe 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16300,37 +16300,28 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
TLI.isOperationLegal(N0.getOpcode(), VT)) {
SDValue X = N0.getOperand(0);
SDValue Y = N0.getOperand(1);
-
- KnownBits KnownX = DAG.computeKnownBits(X);
- KnownBits KnownY = DAG.computeKnownBits(Y);
-
unsigned SrcBits = X.getScalarValueSizeInBits();
unsigned DstBits = VT.getScalarSizeInBits();
- unsigned NeededLeadingZeros = SrcBits - DstBits + 1;
-
- if (KnownX.countMinLeadingZeros() >= NeededLeadingZeros &&
- KnownY.countMinLeadingZeros() >= NeededLeadingZeros) {
+ unsigned MaxBitsX = DAG.ComputeMaxSignificantBits(X);
+ unsigned MaxBitsY = DAG.ComputeMaxSignificantBits(Y);
+ if (MaxBitsX <= DstBits && MaxBitsY <= DstBits) {
SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
}
}
break;
-
case ISD::AVGCEILS:
case ISD::AVGFLOORS:
if (!LegalOperations && N0.hasOneUse() &&
TLI.isOperationLegal(N0.getOpcode(), VT)) {
SDValue X = N0.getOperand(0);
SDValue Y = N0.getOperand(1);
-
unsigned SignBitsX = DAG.ComputeNumSignBits(X);
unsigned SignBitsY = DAG.ComputeNumSignBits(Y);
-
unsigned SrcBits = X.getScalarValueSizeInBits();
unsigned DstBits = VT.getScalarSizeInBits();
unsigned NeededSignBits = SrcBits - DstBits + 1;
-
if (SignBitsX >= NeededSignBits && SignBitsY >= NeededSignBits) {
SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
@@ -16338,7 +16329,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
}
}
break;
-
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
index 175f54d6f9c05..db40746776d43 100644
--- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -1,38 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s
-; CHECK-LABEL: test_avgceil_u
-; CHECK: uhadd v0.8b, v0.8b, v1.8b
+
define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
- %ta = trunc <8 x i16> %a to <8 x i8>
- %tb = trunc <8 x i16> %b to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+; CHECK-LABEL: test_avgceil_u:
+; CHECK: // %bb.0:
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: xtn v1.8b, v1.8h
+; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+
+ %mask = insertelement <8 x i16> undef, i16 255, i32 0
+ %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> undef, <8 x i32> zeroinitializer
+ %ta16 = and <8 x i16> %a, %mask.splat
+ %tb16 = and <8 x i16> %b, %mask.splat
+ %ta8 = trunc <8 x i16> %ta16 to <8 x i8>
+ %tb8 = trunc <8 x i16> %tb16 to <8 x i8>
+ %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
ret <8 x i8> %res
}
-; CHECK-LABEL: test_avgceil_s
-; CHECK: shadd v0.8b, v0.8b, v1.8b
+
define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
- %ta = trunc <8 x i16> %a to <8 x i8>
- %tb = trunc <8 x i16> %b to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+; CHECK-LABEL: test_avgceil_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqxtn v0.8b, v0.8h
+; CHECK-NEXT: sqxtn v1.8b, v1.8h
+; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+
+ %smin = insertelement <8 x i16> undef, i16 -128, i32 0
+ %smax = insertelement <8 x i16> undef, i16 127, i32 0
+ %min = shufflevector <8 x i16> %smin, <8 x i16> undef, <8 x i32> zeroinitializer
+ %max = shufflevector <8 x i16> %smax, <8 x i16> undef, <8 x i32> zeroinitializer
+
+ %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max)
+ %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min)
+ %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max)
+ %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min)
+
+ %ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8>
+ %tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8>
+ %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
ret <8 x i8> %res
}
-; CHECK-LABEL: test_avgfloor_u
-; CHECK: urhadd v0.8b, v0.8b, v1.8b
+
define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) {
- %ta = trunc <8 x i16> %a to <8 x i8>
- %tb = trunc <8 x i16> %b to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+; CHECK-LABEL: test_avgfloor_u:
+; CHECK: // %bb.0:
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: xtn v1.8b, v1.8h
+; CHECK-NEXT: urhadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+
+ %mask = insertelement <8 x i16> undef, i16 255, i32 0
+ %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> undef, <8 x i32> zeroinitializer
+ %ta16 = and <8 x i16> %a, %mask.splat
+ %tb16 = and <8 x i16> %b, %mask.splat
+ %ta8 = trunc <8 x i16> %ta16 to <8 x i8>
+ %tb8 = trunc <8 x i16> %tb16 to <8 x i8>
+ %res = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
ret <8 x i8> %res
}
-; CHECK-LABEL: test_avgfloor_s
-; CHECK: srhadd v0.8b, v0.8b, v1.8b
+
define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
- %ta = trunc <8 x i16> %a to <8 x i8>
- %tb = trunc <8 x i16> %b to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %ta, <8 x i8> %tb)
+; CHECK-LABEL: test_avgfloor_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqxtn v0.8b, v0.8h
+; CHECK-NEXT: sqxtn v1.8b, v1.8h
+; CHECK-NEXT: srhadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+
+ %smin = insertelement <8 x i16> undef, i16 -128, i32 0
+ %smax = insertelement <8 x i16> undef, i16 127, i32 0
+ %min = shufflevector <8 x i16> %smin, <8 x i16> undef, <8 x i32> zeroinitializer
+ %max = shufflevector <8 x i16> %smax, <8 x i16> undef, <8 x i32> zeroinitializer
+
+ %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max)
+ %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min)
+ %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max)
+ %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min)
+
+ %ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8>
+ %tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8>
+ %res = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
ret <8 x i8> %res
}
@@ -41,3 +94,6 @@ declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>)
declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>)
+
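A note on the check this revision uses: ComputeMaxSignificantBits counts significant bits under a signed interpretation (BitWidth - ComputeNumSignBits + 1), so MaxBits <= DstBits is slightly stricter than a pure leading-zero test for the unsigned nodes, which is presumably why the next revision goes back to computeKnownBits. Either way, the bound being exploited is the same: if x, y <= 2^D - 1, then x + y + 1 <= 2^(D+1) - 1, so both (x + y) >> 1 (AVGFLOORU) and (x + y + 1) >> 1 (AVGCEILU) stay <= 2^D - 1 and truncate losslessly. A brute-force confirmation of that bound for D = 8 (a standalone sketch, not part of the patch):

  #include <cassert>
  #include <cstdint>

  int main() {
    // Every pair of values that fits in 8 bits has unsigned floor and
    // ceiling averages that also fit in 8 bits.
    for (uint32_t X = 0; X <= 255; ++X)
      for (uint32_t Y = 0; Y <= 255; ++Y) {
        assert(((X + Y) >> 1) <= 255);     // AVGFLOORU
        assert(((X + Y + 1) >> 1) <= 255); // AVGCEILU
      }
    return 0;
  }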
From c8cc2a98030154d6a95154d8fe8d7461cfb0daf4 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Fri, 8 Aug 2025 10:51:33 +0800
Subject: [PATCH 3/5] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if
they have sufficient leading zero/sign bits -3
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8 +++++---
llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 20 +++++++++----------
2 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0cba06215d3fe..7aea288c03208 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16302,9 +16302,10 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
SDValue Y = N0.getOperand(1);
unsigned SrcBits = X.getScalarValueSizeInBits();
unsigned DstBits = VT.getScalarSizeInBits();
- unsigned MaxBitsX = DAG.ComputeMaxSignificantBits(X);
- unsigned MaxBitsY = DAG.ComputeMaxSignificantBits(Y);
- if (MaxBitsX <= DstBits && MaxBitsY <= DstBits) {
+ KnownBits KnownX = DAG.computeKnownBits(X);
+ KnownBits KnownY = DAG.computeKnownBits(Y);
+ if (KnownX.countMinLeadingZeros() >= (SrcBits - DstBits) &&
+ KnownY.countMinLeadingZeros() >= (SrcBits - DstBits)) {
SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty);
@@ -16322,6 +16323,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
unsigned SrcBits = X.getScalarValueSizeInBits();
unsigned DstBits = VT.getScalarSizeInBits();
unsigned NeededSignBits = SrcBits - DstBits + 1;
+
if (SignBitsX >= NeededSignBits && SignBitsY >= NeededSignBits) {
SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
index db40746776d43..ede39e237a9c9 100644
--- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -10,8 +10,8 @@ define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
- %mask = insertelement <8 x i16> undef, i16 255, i32 0
- %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> undef, <8 x i32> zeroinitializer
+ %mask = insertelement <8 x i16> poison, i16 255, i32 0
+ %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = and <8 x i16> %a, %mask.splat
%tb16 = and <8 x i16> %b, %mask.splat
%ta8 = trunc <8 x i16> %ta16 to <8 x i8>
@@ -29,10 +29,10 @@ define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
- %smin = insertelement <8 x i16> undef, i16 -128, i32 0
- %smax = insertelement <8 x i16> undef, i16 127, i32 0
- %min = shufflevector <8 x i16> %smin, <8 x i16> undef, <8 x i32> zeroinitializer
- %max = shufflevector <8 x i16> %smax, <8 x i16> undef, <8 x i32> zeroinitializer
+ %smin = insertelement <8 x i16> poison, i16 -128, i32 0
+ %smax = insertelement <8 x i16> poison, i16 127, i32 0
+ %min = shufflevector <8 x i16> %smin, <8 x i16> poison, <8 x i32> zeroinitializer
+ %max = shufflevector <8 x i16> %smax, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max)
%ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min)
@@ -73,10 +73,10 @@ define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-NEXT: srhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
- %smin = insertelement <8 x i16> undef, i16 -128, i32 0
- %smax = insertelement <8 x i16> undef, i16 127, i32 0
- %min = shufflevector <8 x i16> %smin, <8 x i16> undef, <8 x i32> zeroinitializer
- %max = shufflevector <8 x i16> %smax, <8 x i16> undef, <8 x i32> zeroinitializer
+ %smin = insertelement <8 x i16> poison, i16 -128, i32 0
+ %smax = insertelement <8 x i16> poison, i16 127, i32 0
+ %min = shufflevector <8 x i16> %smin, <8 x i16> poison, <8 x i32> zeroinitializer
+ %max = shufflevector <8 x i16> %smax, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max)
%ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min)
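With the unsigned guard settled on SrcBits - DstBits known leading zeros, both cases share the same shape: prove each operand fits in the narrow type, then rebuild the average on truncated operands. A standalone model of the unsigned test (illustrative names; the real code uses llvm::KnownBits via DAG.computeKnownBits):

  #include <cstdint>

  // Minimal stand-in for the KnownBits leading-zero query on an i16.
  struct Known {
    uint16_t Zero; // mask of bits known to be zero
  };

  static unsigned countMinLeadingZeros(Known K) {
    unsigned N = 0;
    for (int Bit = 15; Bit >= 0 && ((K.Zero >> Bit) & 1); --Bit)
      ++N;
    return N;
  }

  static bool canNarrowUnsignedAvg(Known X, Known Y, unsigned SrcBits,
                                   unsigned DstBits) {
    unsigned Needed = SrcBits - DstBits;
    return countMinLeadingZeros(X) >= Needed &&
           countMinLeadingZeros(Y) >= Needed;
  }

  int main() {
    Known X{0xFF00}, Y{0xFF00}; // top 8 bits known zero, as after an AND with 255
    return canNarrowUnsignedAvg(X, Y, 16, 8) ? 0 : 1;
  }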
From 11152562f1255a4fcd60404d1e08ca80bf422090 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Fri, 8 Aug 2025 11:40:46 +0800
Subject: [PATCH 4/5] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if
they have sufficient leading zero/sign bits-4
---
llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 59 ++++++++-------------
1 file changed, 22 insertions(+), 37 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
index ede39e237a9c9..4d4e828a751bd 100644
--- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -1,7 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s
-
define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgceil_u:
; CHECK: // %bb.0:
@@ -9,7 +8,6 @@ define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
; CHECK-NEXT: xtn v1.8b, v1.8h
; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
-
%mask = insertelement <8 x i16> poison, i16 255, i32 0
%mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = and <8 x i16> %a, %mask.splat
@@ -20,7 +18,6 @@ define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
ret <8 x i8> %res
}
-
define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgceil_s:
; CHECK: // %bb.0:
@@ -28,72 +25,60 @@ define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-NEXT: sqxtn v1.8b, v1.8h
; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
-
- %smin = insertelement <8 x i16> poison, i16 -128, i32 0
- %smax = insertelement <8 x i16> poison, i16 127, i32 0
- %min = shufflevector <8 x i16> %smin, <8 x i16> poison, <8 x i32> zeroinitializer
- %max = shufflevector <8 x i16> %smax, <8 x i16> poison, <8 x i32> zeroinitializer
-
- %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max)
- %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min)
- %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max)
- %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min)
-
+ %min = insertelement <8 x i16> poison, i16 -128, i32 0
+ %min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer
+ %max = insertelement <8 x i16> poison, i16 127, i32 0
+ %max.splat = shufflevector <8 x i16> %max, <8 x i16> poison, <8 x i32> zeroinitializer
+ %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max.splat)
+ %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat)
+ %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat)
+ %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat)
%ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8>
%tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8>
%res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
ret <8 x i8> %res
}
-
define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgfloor_u:
; CHECK: // %bb.0:
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: urhadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
-
- %mask = insertelement <8 x i16> undef, i16 255, i32 0
- %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> undef, <8 x i32> zeroinitializer
+ %mask = insertelement <8 x i16> poison, i16 255, i32 0
+ %mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = and <8 x i16> %a, %mask.splat
%tb16 = and <8 x i16> %b, %mask.splat
%ta8 = trunc <8 x i16> %ta16 to <8 x i8>
%tb8 = trunc <8 x i16> %tb16 to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
+ %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
ret <8 x i8> %res
}
-
define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgfloor_s:
; CHECK: // %bb.0:
; CHECK-NEXT: sqxtn v0.8b, v0.8h
; CHECK-NEXT: sqxtn v1.8b, v1.8h
-; CHECK-NEXT: srhadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
-
- %smin = insertelement <8 x i16> poison, i16 -128, i32 0
- %smax = insertelement <8 x i16> poison, i16 127, i32 0
- %min = shufflevector <8 x i16> %smin, <8 x i16> poison, <8 x i32> zeroinitializer
- %max = shufflevector <8 x i16> %smax, <8 x i16> poison, <8 x i32> zeroinitializer
-
- %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max)
- %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min)
- %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max)
- %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min)
-
+ %min = insertelement <8 x i16> poison, i16 -128, i32 0
+ %min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer
+ %max = insertelement <8 x i16> poison, i16 127, i32 0
+ %max.splat = shufflevector <8 x i16> %max, <8 x i16> poison, <8 x i32> zeroinitializer
+ %ta16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %max.splat)
+ %ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat)
+ %tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat)
+ %tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat)
%ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8>
%tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
+ %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
ret <8 x i8> %res
}
declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>)
declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>)
-
declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>)
From 08138a2fde9896a580d11a2b4249eea86d42fefe Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Fri, 8 Aug 2025 12:55:44 +0800
Subject: [PATCH 5/5] [DAG] Fold trunc(avg(x,y)) for avgceil/floor u/s nodes if
they have sufficient leading zero/sign bits-5
---
llvm/test/CodeGen/AArch64/trunc-avg-fold.ll | 52 ++++++++++++---------
1 file changed, 30 insertions(+), 22 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
index 4d4e828a751bd..36fddedd78df6 100644
--- a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll
@@ -4,26 +4,31 @@
define <8 x i8> @test_avgceil_u(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgceil_u:
; CHECK: // %bb.0:
+; CHECK-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: xtn v0.8b, v0.8h
-; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%mask = insertelement <8 x i16> poison, i16 255, i32 0
%mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = and <8 x i16> %a, %mask.splat
%tb16 = and <8 x i16> %b, %mask.splat
- %ta8 = trunc <8 x i16> %ta16 to <8 x i8>
- %tb8 = trunc <8 x i16> %tb16 to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
+ %avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16)
+ %res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
}
define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgceil_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: sqxtn v0.8b, v0.8h
-; CHECK-NEXT: sqxtn v1.8b, v1.8h
-; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: movi v2.8h, #127
+; CHECK-NEXT: mvni v3.8h, #127
+; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h
+; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h
+; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h
+; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%min = insertelement <8 x i16> poison, i16 -128, i32 0
%min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -33,35 +38,39 @@ define <8 x i8> @test_avgceil_s(<8 x i16> %a, <8 x i16> %b) {
%ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat)
%tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat)
%tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat)
- %ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8>
- %tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
+ %avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped)
+ %res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
}
define <8 x i8> @test_avgfloor_u(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgfloor_u:
; CHECK: // %bb.0:
+; CHECK-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: xtn v0.8b, v0.8h
-; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%mask = insertelement <8 x i16> poison, i16 255, i32 0
%mask.splat = shufflevector <8 x i16> %mask, <8 x i16> poison, <8 x i32> zeroinitializer
%ta16 = and <8 x i16> %a, %mask.splat
%tb16 = and <8 x i16> %b, %mask.splat
- %ta8 = trunc <8 x i16> %ta16 to <8 x i8>
- %tb8 = trunc <8 x i16> %tb16 to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
+ %avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %ta16, <8 x i16> %tb16)
+ %res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
}
define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_avgfloor_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: sqxtn v0.8b, v0.8h
-; CHECK-NEXT: sqxtn v1.8b, v1.8h
-; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: movi v2.8h, #127
+; CHECK-NEXT: mvni v3.8h, #127
+; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: smin v1.8h, v1.8h, v2.8h
+; CHECK-NEXT: smax v0.8h, v0.8h, v3.8h
+; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h
+; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%min = insertelement <8 x i16> poison, i16 -128, i32 0
%min.splat = shufflevector <8 x i16> %min, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -71,9 +80,8 @@ define <8 x i8> @test_avgfloor_s(<8 x i16> %a, <8 x i16> %b) {
%ta16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %ta16, <8 x i16> %min.splat)
%tb16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %b, <8 x i16> %max.splat)
%tb16.clamped = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %tb16, <8 x i16> %min.splat)
- %ta8 = trunc <8 x i16> %ta16.clamped to <8 x i8>
- %tb8 = trunc <8 x i16> %tb16.clamped to <8 x i8>
- %res = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %ta8, <8 x i8> %tb8)
+ %avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %ta16.clamped, <8 x i16> %tb16.clamped)
+ %res = trunc <8 x i16> %avg16 to <8 x i8>
ret <8 x i8> %res
}
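The net effect of the combine, independent of any target intrinsics: once both operands fit in the narrow type, truncating before or after the wide average gives the same result, i.e. trunc(avgceilu(x, y)) == avgceilu(trunc(x), trunc(y)). An exhaustive check of that equivalence for the i16 -> i8 ceiling case (standalone and illustrative only, not how the DAG computes it):

  #include <cassert>
  #include <cstdint>

  static uint8_t avgCeilU8(uint8_t X, uint8_t Y) {
    return (uint8_t)(((uint32_t)X + Y + 1) >> 1);
  }

  int main() {
    for (uint32_t X = 0; X <= 255; ++X)
      for (uint32_t Y = 0; Y <= 255; ++Y) {
        // Wide i16 average of operands whose top bits are zero...
        uint16_t Wide = (uint16_t)((X + Y + 1) >> 1);
        // ...truncates to the same i8 value as the narrow average.
        assert((uint8_t)Wide == avgCeilU8((uint8_t)X, (uint8_t)Y));
      }
    return 0;
  }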