[llvm] [DAG] Add TRUNCATE_SSAT_S/U and TRUNCATE_USAT_U to canCreateUndefOrPoison (#152143) (PR #168809)
Jerry Dang via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 23 15:25:13 PST 2025
https://github.com/kuroyukiasuna updated https://github.com/llvm/llvm-project/pull/168809
From 2af6457e115bd2bebd335416a936b2ae43de4851 Mon Sep 17 00:00:00 2001
From: Jerry Dang <kuroyukiasuna at gmail.com>
Date: Wed, 19 Nov 2025 20:57:26 -0500
Subject: [PATCH 1/3] [DAG] Add TRUNCATE_SSAT_S/U and TRUNCATE_USAT_U to
canCreateUndefOrPoison (#152143)
Saturating truncation operations are well-defined for all inputs and cannot create
poison or undef values. This allows the optimizer to eliminate unnecessary freeze
instructions after these operations.
Fixes #152143
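To make the claim concrete, here is a minimal scalar model of the semantics of
the three nodes (an illustrative C++ sketch, not the DAG implementation): each
one is a total clamping function, so a well-defined input can never produce
undef or poison.

#include <algorithm>
#include <cassert>
#include <cstdint>

// TRUNCATE_SSAT_S: signed input clamped to the signed range of the result.
static int16_t truncSsatS(int32_t X) {
  return static_cast<int16_t>(std::clamp<int32_t>(X, INT16_MIN, INT16_MAX));
}

// TRUNCATE_SSAT_U: signed input clamped to the unsigned range of the result.
static uint16_t truncSsatU(int32_t X) {
  return static_cast<uint16_t>(std::clamp<int32_t>(X, 0, UINT16_MAX));
}

// TRUNCATE_USAT_U: unsigned input clamped to the unsigned range of the result.
static uint16_t truncUsatU(uint32_t X) {
  return static_cast<uint16_t>(std::min<uint32_t>(X, UINT16_MAX));
}

int main() {
  assert(truncSsatS(100000) == INT16_MAX && truncSsatS(-100000) == INT16_MIN);
  assert(truncSsatU(-1) == 0 && truncSsatU(100000) == UINT16_MAX);
  assert(truncUsatU(0x12345u) == UINT16_MAX && truncUsatU(42u) == 42);
  return 0;
}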
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 3 +
llvm/test/CodeGen/X86/truncate-sat-freeze.ll | 64 +++++++++++++++++++
2 files changed, 67 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/truncate-sat-freeze.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 1b15a207a2d37..0f0174c8aea35 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5664,6 +5664,9 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::FP_EXTEND:
case ISD::FP_TO_SINT_SAT:
case ISD::FP_TO_UINT_SAT:
+ case ISD::TRUNCATE_SSAT_U:
+ case ISD::TRUNCATE_SSAT_S:
+ case ISD::TRUNCATE_USAT_U:
// No poison except from flags (which is handled above)
return false;
diff --git a/llvm/test/CodeGen/X86/truncate-sat-freeze.ll b/llvm/test/CodeGen/X86/truncate-sat-freeze.ll
new file mode 100644
index 0000000000000..78aebe05ec1de
--- /dev/null
+++ b/llvm/test/CodeGen/X86/truncate-sat-freeze.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s
+
+; Test that freeze is eliminated for saturation truncate patterns.
+; The freeze elimination happens at the IR level due to the IntrNoCreateUndefOrPoison
+; attribute on the llvm.smax/smin/umin intrinsics. At the SelectionDAG level,
+; TRUNCATE_SSAT_S/U and TRUNCATE_USAT_U operations are also marked in
+; canCreateUndefOrPoison() to ensure consistency and enable potential future
+; optimizations. This test validates the end-to-end behavior that no freeze
+; instruction appears in the output.
+
+define <2 x i32> @trunc_ssat_s_freeze(<2 x i64> %a0) {
+; CHECK-LABEL: trunc_ssat_s_freeze:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647]
+; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: retq
+ %1 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a0, <2 x i64> <i64 -2147483648, i64 -2147483648>)
+ %2 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %1, <2 x i64> <i64 2147483647, i64 2147483647>)
+ %3 = trunc <2 x i64> %2 to <2 x i32>
+ %4 = freeze <2 x i32> %3
+ ret <2 x i32> %4
+}
+
+define <2 x i32> @trunc_ssat_u_freeze(<2 x i64> %a0) {
+; CHECK-LABEL: trunc_ssat_u_freeze:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
+; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
+; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: retq
+ %1 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a0, <2 x i64> zeroinitializer)
+ %2 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %1, <2 x i64> <i64 4294967295, i64 4294967295>)
+ %3 = trunc <2 x i64> %2 to <2 x i32>
+ %4 = freeze <2 x i32> %3
+ ret <2 x i32> %4
+}
+
+define <2 x i32> @trunc_usat_u_freeze(<2 x i64> %a0) {
+; CHECK-LABEL: trunc_usat_u_freeze:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: retq
+ %1 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %a0, <2 x i64> <i64 4294967295, i64 4294967295>)
+ %2 = trunc <2 x i64> %1 to <2 x i32>
+ %3 = freeze <2 x i32> %2
+ ret <2 x i32> %3
+}
+
+declare <2 x i64> @llvm.smax.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.umin.v2i64(<2 x i64>, <2 x i64>)
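The test comment above refers to the freeze elimination these entries enable.
The underlying fold (roughly: freeze(op(x)) may become op(freeze(x)) once op is
known not to create undef or poison) can be modelled with a small toy; the
names below are illustrative only, not LLVM APIs.

#include <cassert>
#include <cstdint>
#include <optional>

// Toy lane model: nullopt stands for a poison/undef lane, a value for a
// well-defined lane. Illustrative only, not LLVM's representation.
using Lane = std::optional<int64_t>;

// Saturating truncate is total on defined inputs: it propagates poison but
// never creates it.
static Lane truncSsatS32(Lane X) {
  if (!X)
    return std::nullopt;
  int64_t V = *X;
  if (V < INT32_MIN) V = INT32_MIN;
  if (V > INT32_MAX) V = INT32_MAX;
  return V;
}

// freeze: replaces a poison lane with some arbitrary but fixed value.
static int64_t freezeLane(Lane X) { return X ? *X : 0 /* arbitrary */; }

int main() {
  // In this toy, freeze(truncSsatS32(x)) and truncSsatS32(freeze(x)) agree on
  // every lane because truncSsatS32 never creates poison; that agreement is
  // what lets the later freeze be dropped.
  for (Lane X : {Lane(1LL << 40), Lane(-7), Lane(std::nullopt)})
    assert(freezeLane(truncSsatS32(X)) == *truncSsatS32(Lane(freezeLane(X))));
  return 0;
}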
From 1aa7f89cb92af08d4a60c79dedff0025b30af8df Mon Sep 17 00:00:00 2001
From: Jerry Dang <kuroyukiasuna at gmail.com>
Date: Thu, 20 Nov 2025 22:15:35 -0500
Subject: [PATCH 2/3] Add tests in AArch64; Remove previous tests in X86
---
.../CodeGen/AArch64/truncate-sat-freeze.ll | 80 +++++++++++++++++++
llvm/test/CodeGen/X86/truncate-sat-freeze.ll | 64 ---------------
2 files changed, 80 insertions(+), 64 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/truncate-sat-freeze.ll
delete mode 100644 llvm/test/CodeGen/X86/truncate-sat-freeze.ll
diff --git a/llvm/test/CodeGen/AArch64/truncate-sat-freeze.ll b/llvm/test/CodeGen/AArch64/truncate-sat-freeze.ll
new file mode 100644
index 0000000000000..97bf1bac2a7db
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/truncate-sat-freeze.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s
+
+; Test that saturating truncate operations work correctly with freeze.
+; These intrinsics map to TRUNCATE_SSAT_S, TRUNCATE_SSAT_U, and TRUNCATE_USAT_U,
+; which are marked in canCreateUndefOrPoison() as not creating poison.
+; This allows freeze to be eliminated, enabling optimizations like select simplification.
+
+define <4 x i16> @sqxtn_with_freeze(<4 x i32> %a) {
+; CHECK-LABEL: sqxtn_with_freeze:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqxtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %trunc = tail call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %a)
+ %freeze = freeze <4 x i16> %trunc
+ ret <4 x i16> %freeze
+}
+
+define <4 x i16> @sqxtun_with_freeze(<4 x i32> %a) {
+; CHECK-LABEL: sqxtun_with_freeze:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqxtun v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %trunc = tail call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %a)
+ %freeze = freeze <4 x i16> %trunc
+ ret <4 x i16> %freeze
+}
+
+define <8 x i8> @uqxtn_with_freeze(<8 x i16> %a) {
+; CHECK-LABEL: uqxtn_with_freeze:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uqxtn v0.8b, v0.8h
+; CHECK-NEXT: ret
+ %trunc = tail call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %a)
+ %freeze = freeze <8 x i8> %trunc
+ ret <8 x i8> %freeze
+}
+
+; Test freeze elimination enables select simplification for sqxtn
+define <4 x i16> @test_sqxtn_freeze_removal_select(<4 x i32> %a, i1 %cond) {
+; CHECK-LABEL: test_sqxtn_freeze_removal_select:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqxtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %safe_a = freeze <4 x i32> %a
+ %val = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %safe_a)
+ %frozen_val = freeze <4 x i16> %val
+ %res = select i1 %cond, <4 x i16> %frozen_val, <4 x i16> %val
+ ret <4 x i16> %res
+}
+
+; Test freeze elimination enables select simplification for sqxtun
+define <4 x i16> @test_sqxtun_freeze_removal_select(<4 x i32> %a, i1 %cond) {
+; CHECK-LABEL: test_sqxtun_freeze_removal_select:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqxtun v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %safe_a = freeze <4 x i32> %a
+ %val = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %safe_a)
+ %frozen_val = freeze <4 x i16> %val
+ %res = select i1 %cond, <4 x i16> %frozen_val, <4 x i16> %val
+ ret <4 x i16> %res
+}
+
+; Test freeze elimination enables select simplification for uqxtn
+define <8 x i8> @test_uqxtn_freeze_removal_select(<8 x i16> %a, i1 %cond) {
+; CHECK-LABEL: test_uqxtn_freeze_removal_select:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uqxtn v0.8b, v0.8h
+; CHECK-NEXT: ret
+ %safe_a = freeze <8 x i16> %a
+ %val = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %safe_a)
+ %frozen_val = freeze <8 x i8> %val
+ %res = select i1 %cond, <8 x i8> %frozen_val, <8 x i8> %val
+ ret <8 x i8> %res
+}
+
+declare <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32>)
+declare <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32>)
+declare <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16>)
diff --git a/llvm/test/CodeGen/X86/truncate-sat-freeze.ll b/llvm/test/CodeGen/X86/truncate-sat-freeze.ll
deleted file mode 100644
index 78aebe05ec1de..0000000000000
--- a/llvm/test/CodeGen/X86/truncate-sat-freeze.ll
+++ /dev/null
@@ -1,64 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s
-
-; Test that freeze is eliminated for saturation truncate patterns.
-; The freeze elimination happens at the IR level due to the IntrNoCreateUndefOrPoison
-; attribute on the llvm.smax/smin/umin intrinsics. At the SelectionDAG level,
-; TRUNCATE_SSAT_S/U and TRUNCATE_USAT_U operations are also marked in
-; canCreateUndefOrPoison() to ensure consistency and enable potential future
-; optimizations. This test validates the end-to-end behavior that no freeze
-; instruction appears in the output.
-
-define <2 x i32> @trunc_ssat_s_freeze(<2 x i64> %a0) {
-; CHECK-LABEL: trunc_ssat_s_freeze:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647]
-; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: retq
- %1 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a0, <2 x i64> <i64 -2147483648, i64 -2147483648>)
- %2 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %1, <2 x i64> <i64 2147483647, i64 2147483647>)
- %3 = trunc <2 x i64> %2 to <2 x i32>
- %4 = freeze <2 x i32> %3
- ret <2 x i32> %4
-}
-
-define <2 x i32> @trunc_ssat_u_freeze(<2 x i64> %a0) {
-; CHECK-LABEL: trunc_ssat_u_freeze:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
-; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: retq
- %1 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a0, <2 x i64> zeroinitializer)
- %2 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %1, <2 x i64> <i64 4294967295, i64 4294967295>)
- %3 = trunc <2 x i64> %2 to <2 x i32>
- %4 = freeze <2 x i32> %3
- ret <2 x i32> %4
-}
-
-define <2 x i32> @trunc_usat_u_freeze(<2 x i64> %a0) {
-; CHECK-LABEL: trunc_usat_u_freeze:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; CHECK-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: retq
- %1 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %a0, <2 x i64> <i64 4294967295, i64 4294967295>)
- %2 = trunc <2 x i64> %1 to <2 x i32>
- %3 = freeze <2 x i32> %2
- ret <2 x i32> %3
-}
-
-declare <2 x i64> @llvm.smax.v2i64(<2 x i64>, <2 x i64>)
-declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>)
-declare <2 x i64> @llvm.umin.v2i64(<2 x i64>, <2 x i64>)
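The select tests above exercise the follow-on simplification: select %cond,
freeze(%val), %val can only collapse to %val when %val is guaranteed not to be
undef or poison, which the canCreateUndefOrPoison entries now establish for the
saturating truncates. A toy illustration of why that guarantee is needed (plain
C++, names are illustrative, not LLVM APIs):

#include <cassert>
#include <cstdint>
#include <optional>

// Toy model: nullopt is a poison value; freeze resolves it to an arbitrary
// but fixed value.
using Val = std::optional<int16_t>;

static int16_t freezeVal(Val V) { return V ? *V : 123 /* arbitrary */; }

// Models 'select c, freeze(v), v' for one value.
static Val selectFreeze(bool C, Val V) {
  if (C)
    return freezeVal(V); // the frozen arm is always well defined
  return V;              // the raw arm may still be poison
}

int main() {
  // If v might be poison, the two arms differ observably, so the select
  // cannot simply be replaced by v.
  Val Poison = std::nullopt;
  assert(selectFreeze(true, Poison).has_value() && !Poison.has_value());

  // If v is guaranteed well defined (e.g. a saturating truncate of a frozen
  // input), freeze(v) == v and the select folds to v on both paths.
  Val Defined = int16_t(42);
  assert(selectFreeze(true, Defined) == Defined);
  assert(selectFreeze(false, Defined) == Defined);
  return 0;
}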
From 1bd1876716ef2b76c1390c264c1ccec405aa30ca Mon Sep 17 00:00:00 2001
From: Jerry Dang <kuroyukiasuna at gmail.com>
Date: Sun, 23 Nov 2025 16:23:29 -0500
Subject: [PATCH 3/3] Implement computeKnownBits for TRUNCATE_SSAT_S/U and
TRUNCATE_USAT_U
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 72 +++++++++++-
.../CodeGen/AArch64/truncate-sat-freeze.ll | 110 +++++++++---------
2 files changed, 127 insertions(+), 55 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0f0174c8aea35..781d7946e83ab 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3956,6 +3956,76 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
Known = Known.trunc(BitWidth);
break;
}
+ case ISD::TRUNCATE_SSAT_S: {
+ // Pass through DemandedElts to the recursive call
+ KnownBits InputKnown = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ unsigned InputBits = InputKnown.getBitWidth();
+
+ APInt MinInRange = APInt::getSignedMinValue(BitWidth).sext(InputBits);
+ APInt MaxInRange = APInt::getSignedMaxValue(BitWidth).sext(InputBits);
+ APInt InputMin = InputKnown.getSignedMinValue();
+ APInt InputMax = InputKnown.getSignedMaxValue();
+
+ if (InputMin.sge(MinInRange) && InputMax.sle(MaxInRange)) {
+ Known = InputKnown.trunc(BitWidth);
+ } else if (InputMax.slt(MinInRange)) {
+ Known.makeConstant(APInt::getSignedMinValue(BitWidth));
+ } else if (InputMin.sgt(MaxInRange)) {
+ Known.makeConstant(APInt::getSignedMaxValue(BitWidth));
+ } else {
+ Known.resetAll();
+ if (InputKnown.isNegative()) {
+ Known.makeNegative();
+ Known.Zero = InputKnown.Zero.trunc(BitWidth);
+ Known.Zero.clearSignBit();
+ } else if (InputKnown.isNonNegative()) {
+ Known.makeNonNegative();
+ Known.One = InputKnown.One.trunc(BitWidth);
+ }
+ }
+ break;
+ }
+ case ISD::TRUNCATE_SSAT_U: {
+ // Signed -> Unsigned saturating truncation
+ KnownBits InputKnown = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ unsigned InputBits = InputKnown.getBitWidth();
+
+ APInt MaxInRange = APInt::getAllOnes(BitWidth).zext(InputBits);
+ APInt InputMin = InputKnown.getSignedMinValue();
+ APInt InputMax = InputKnown.getSignedMaxValue();
+
+ if (InputKnown.isNegative()) {
+ Known.setAllZero();
+ } else if (InputMin.isNonNegative() && InputMax.ule(MaxInRange)) {
+ Known = InputKnown.trunc(BitWidth);
+ } else if (InputMin.isNonNegative() && InputMin.ugt(MaxInRange)) {
+ Known.setAllOnes();
+ } else {
+ // Mixed positive/negative, or positive but might saturate.
+ // We can't assume much here.
+ Known.resetAll();
+ }
+ break;
+ }
+ case ISD::TRUNCATE_USAT_U: {
+ // Unsigned -> Unsigned saturating truncation
+ KnownBits InputKnown = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ unsigned InputBits = InputKnown.getBitWidth();
+
+ APInt MaxInRange = APInt::getLowBitsSet(InputBits, BitWidth);
+ APInt InputMax = InputKnown.getMaxValue();
+ APInt InputMin = InputKnown.getMinValue();
+
+ if (InputMax.ule(MaxInRange)) {
+ Known = InputKnown.trunc(BitWidth);
+ } else if (InputMin.ugt(MaxInRange)) {
+ Known.setAllOnes();
+ } else {
+ Known.resetAll();
+ Known.One = InputKnown.One.trunc(BitWidth);
+ }
+ break;
+ }
case ISD::AssertZext: {
EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits());
@@ -5664,8 +5734,8 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::FP_EXTEND:
case ISD::FP_TO_SINT_SAT:
case ISD::FP_TO_UINT_SAT:
- case ISD::TRUNCATE_SSAT_U:
case ISD::TRUNCATE_SSAT_S:
+ case ISD::TRUNCATE_SSAT_U:
case ISD::TRUNCATE_USAT_U:
// No poison except from flags (which is handled above)
return false;
diff --git a/llvm/test/CodeGen/AArch64/truncate-sat-freeze.ll b/llvm/test/CodeGen/AArch64/truncate-sat-freeze.ll
index 97bf1bac2a7db..5314731b234f9 100644
--- a/llvm/test/CodeGen/AArch64/truncate-sat-freeze.ll
+++ b/llvm/test/CodeGen/AArch64/truncate-sat-freeze.ll
@@ -1,78 +1,80 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
-; Test that saturating truncate operations work correctly with freeze.
-; These intrinsics map to TRUNCATE_SSAT_S, TRUNCATE_SSAT_U, and TRUNCATE_USAT_U,
-; which are marked in canCreateUndefOrPoison() as not creating poison.
-; This allows freeze to be eliminated, enabling optimizations like select simplification.
+;; ============================================================================
+;; Tests for canCreateUndefOrPoison = false
+;; These verify that freeze operations are correctly eliminated
+;; ============================================================================
-define <4 x i16> @sqxtn_with_freeze(<4 x i32> %a) {
-; CHECK-LABEL: sqxtn_with_freeze:
+; TRUNCATE_SSAT_S: No saturation path
+define i1 @sqxtn_no_sat_with_freeze(<4 x i32> %x) {
+; CHECK-LABEL: sqxtn_no_sat_with_freeze:
; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.4s, #100
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: sqxtn v0.4h, v0.4s
+; CHECK-NEXT: umov w8, v0.h[0]
+; CHECK-NEXT: and w8, w8, #0xfffc
+; CHECK-NEXT: cmp w8, #200
+; CHECK-NEXT: cset w0, gt
; CHECK-NEXT: ret
- %trunc = tail call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %a)
+ %masked = and <4 x i32> %x, <i32 100, i32 100, i32 100, i32 100>
+ %trunc = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %masked)
%freeze = freeze <4 x i16> %trunc
- ret <4 x i16> %freeze
+ %extract = extractelement <4 x i16> %freeze, i32 0
+ ; Input is [0,100], so result > 200 is always false
+ %cmp = icmp sgt i16 %extract, 200
+ ret i1 %cmp
}
-define <4 x i16> @sqxtun_with_freeze(<4 x i32> %a) {
-; CHECK-LABEL: sqxtun_with_freeze:
+; TRUNCATE_SSAT_S: Test specific known bits
+define i16 @sqxtn_known_bits(<4 x i32> %x) {
+; CHECK-LABEL: sqxtn_known_bits:
; CHECK: // %bb.0:
-; CHECK-NEXT: sqxtun v0.4h, v0.4s
+; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: ret
- %trunc = tail call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %a)
+ ; Input: [0, 32512] fits in i16 without saturation
+ %masked = and <4 x i32> %x, <i32 32512, i32 32512, i32 32512, i32 32512>
+ %trunc = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %masked)
%freeze = freeze <4 x i16> %trunc
- ret <4 x i16> %freeze
+ %extract = extractelement <4 x i16> %freeze, i32 0
+ ; AND with 127 keeps only the low 7 bits; known bits show those bits are
+ ; already 0 here, so the whole expression folds to 0
+ %and = and i16 %extract, 127
+ ret i16 %and
}
-define <8 x i8> @uqxtn_with_freeze(<8 x i16> %a) {
-; CHECK-LABEL: uqxtn_with_freeze:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqxtn v0.8b, v0.8h
-; CHECK-NEXT: ret
- %trunc = tail call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %a)
- %freeze = freeze <8 x i8> %trunc
- ret <8 x i8> %freeze
-}
-
-; Test freeze elimination enables select simplification for sqxtn
-define <4 x i16> @test_sqxtn_freeze_removal_select(<4 x i32> %a, i1 %cond) {
-; CHECK-LABEL: test_sqxtn_freeze_removal_select:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqxtn v0.4h, v0.4s
-; CHECK-NEXT: ret
- %safe_a = freeze <4 x i32> %a
- %val = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %safe_a)
- %frozen_val = freeze <4 x i16> %val
- %res = select i1 %cond, <4 x i16> %frozen_val, <4 x i16> %val
- ret <4 x i16> %res
-}
+;; ============================================================================
+;; Tests for computeKnownBits
+;; These verify that known bits analysis enables optimizations
+;; ============================================================================
-; Test freeze elimination enables select simplification for sqxtun
-define <4 x i16> @test_sqxtun_freeze_removal_select(<4 x i32> %a, i1 %cond) {
-; CHECK-LABEL: test_sqxtun_freeze_removal_select:
+; Constant 32512 = 0b0111111100000000 has known zero lower bits
+; After truncation, ANDing with 127 (lower 7 bits) gives 0
+define i16 @sqxtn_known_zero_bits(<4 x i32> %x) {
+; CHECK-LABEL: sqxtn_known_zero_bits:
; CHECK: // %bb.0:
-; CHECK-NEXT: sqxtun v0.4h, v0.4s
+; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: ret
- %safe_a = freeze <4 x i32> %a
- %val = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %safe_a)
- %frozen_val = freeze <4 x i16> %val
- %res = select i1 %cond, <4 x i16> %frozen_val, <4 x i16> %val
- ret <4 x i16> %res
+ %masked = and <4 x i32> %x, <i32 32512, i32 32512, i32 32512, i32 32512>
+ %trunc = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %masked)
+ %extract = extractelement <4 x i16> %trunc, i32 0
+ %and = and i16 %extract, 127
+ ret i16 %and
}
-; Test freeze elimination enables select simplification for uqxtn
-define <8 x i8> @test_uqxtn_freeze_removal_select(<8 x i16> %a, i1 %cond) {
-; CHECK-LABEL: test_uqxtn_freeze_removal_select:
+; Input range [0, 256] means the result is always non-negative,
+; so the signed less-than-zero comparison (icmp slt) folds to false (returns 0)
+define i1 @sqxtn_known_nonnegative(<4 x i32> %x) {
+; CHECK-LABEL: sqxtn_known_nonnegative:
; CHECK: // %bb.0:
-; CHECK-NEXT: uqxtn v0.8b, v0.8h
+; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: ret
- %safe_a = freeze <8 x i16> %a
- %val = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %safe_a)
- %frozen_val = freeze <8 x i8> %val
- %res = select i1 %cond, <8 x i8> %frozen_val, <8 x i8> %val
- ret <8 x i8> %res
+ %masked = and <4 x i32> %x, <i32 256, i32 256, i32 256, i32 256>
+ %trunc = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %masked)
+ %extract = extractelement <4 x i16> %trunc, i32 0
+ %cmp = icmp slt i16 %extract, 0
+ ret i1 %cmp
}
declare <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32>)
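The known-bits cases added above follow the same three-way case split as this
range sketch (a deliberately simplified approximation: the real code tracks
individual bits, not just a signed range): if the whole input range fits in the
destination, the node behaves like a plain truncate; if the range lies entirely
outside, the result is the corresponding saturation constant; otherwise only
partial information survives.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

// Signed range of the input value (an over-approximation of known bits).
using Range = std::pair<int64_t, int64_t>; // [Lo, Hi], inclusive

// Range of an i32 -> i16 TRUNCATE_SSAT_S, mirroring the case split in the
// computeKnownBits change above (illustrative sketch only).
static Range ssatS16Range(Range In) {
  const int64_t Min = INT16_MIN, Max = INT16_MAX;
  if (In.first >= Min && In.second <= Max)
    return In;                    // fits: same value, so same information
  if (In.second < Min)
    return {Min, Min};            // always saturates to the minimum
  if (In.first > Max)
    return {Max, Max};            // always saturates to the maximum
  return {std::max(In.first, Min), std::min(In.second, Max)}; // partial info
}

int main() {
  // Matches sqxtn_known_bits / sqxtn_known_zero_bits: input masked with 32512
  // lies in [0, 32512], which fits in i16, so the truncate preserves the value
  // (and, in the real code, its known-zero low bits, letting
  // 'and i16 %x, 127' fold to 0).
  Range R = ssatS16Range({0, 32512});
  assert(R.first == 0 && R.second == 32512);

  // Matches sqxtn_known_nonnegative: [0, 256] stays non-negative, so
  // 'icmp slt i16 %x, 0' folds to false.
  Range N = ssatS16Range({0, 256});
  assert(N.first >= 0);

  // Entirely above the representable range: constant INT16_MAX.
  Range S = ssatS16Range({40000, 50000});
  assert(S.first == INT16_MAX && S.second == INT16_MAX);
  return 0;
}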