[llvm] [DAG] Add TRUNCATE_SSAT_S/U and TRUNCATE_USAT_U to canCreateUndefOrPoison and computeKnownBits (#152143) (PR #168809)

Sun Nov 23 15:33:16 PST 2025

================
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+;; ============================================================================
+;; Tests for canCreateUndefOrPoison = false
+;; These verify that freeze operations are correctly eliminated
+;; ============================================================================
+
+; TRUNCATE_SSAT_S: no-saturation path. Every lane is masked with 100 before the
+; sqxtn call, so the value already fits in i16.
+define i1 @sqxtn_no_sat_with_freeze(<4 x i32> %x) {
+; CHECK-LABEL: sqxtn_no_sat_with_freeze:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.4s, #100
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    sqxtn v0.4h, v0.4s
+; CHECK-NEXT:    umov w8, v0.h[0]
+; CHECK-NEXT:    and w8, w8, #0xfffc
+; CHECK-NEXT:    cmp w8, #200
+; CHECK-NEXT:    cset w0, gt
+; CHECK-NEXT:    ret
+  %masked = and <4 x i32> %x, <i32 100, i32 100, i32 100, i32 100>
+  %trunc = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %masked)
+  %freeze = freeze <4 x i16> %trunc
+  %extract = extractelement <4 x i16> %freeze, i32 0
+  ; Input is in [0,100], so "result > 200" is always false.
+  ; NOTE(review): the CHECK lines above still expect cmp/cset, i.e. the compare
+  ; is not folded to a constant in the expected output - confirm this is intended.
+  %cmp = icmp sgt i16 %extract, 200
+  ret i1 %cmp
+}
+
+; TRUNCATE_SSAT_S: known bits of the masked input survive the saturating
+; truncate and the freeze, allowing a later AND to be folded away.
+define i16 @sqxtn_known_bits(<4 x i32> %x) {
+; CHECK-LABEL: sqxtn_known_bits:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    ret
+  ; Input: [0, 32512] fits in i16 without saturation
+  %masked = and <4 x i32> %x, <i32 32512, i32 32512, i32 32512, i32 32512>
+  %trunc = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %masked)
+  %freeze = freeze <4 x i16> %trunc
+  %extract = extractelement <4 x i16> %freeze, i32 0
+  ; 32512 is 0x7F00, so the low 8 bits of the masked value are zero. The AND
+  ; with 127 (low 7 bits) therefore folds to 0, matching "mov w0, wzr" above.
+  %and = and i16 %extract, 127
+  ret i16 %and
+}
+
+;; ============================================================================
----------------
kuroyukiasuna wrote:

computeKnownBits

`TRUNCATE_SSAT_S` has been verified to behave correctly together with `canCreateUndefOrPoison`.

`TRUNCATE_SSAT_U` and `TRUNCATE_USAT_U` do not show the same behavior: optimizations such as constant folding do not trigger for them.

Root Cause: Vector constant propagation issue. When analyzing operations like:
```
%masked = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
%trunc = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %masked)
```
The constant vector is lowered to AArch64-specific nodes (NVCAST → MOVIedit), and the constant information is lost in the process. As a result, `computeKnownBits` reports the input as completely unknown:
```
TRUNCATE_SSAT_U: InputKnown = ????????????????????????????????
  Input range (signed): [-2147483648, 2147483647]
```
The affected constants appear to be byte-aligned all-ones values, which the AArch64 backend lowers to MOVI immediate instructions; the constant value is lost during that transformation.

Why TRUNCATE_SSAT_S works: Unclear.

https://github.com/llvm/llvm-project/pull/168809