[llvm] [AArch64][SVE] Rework VECTOR_COMPRESS lowering (PR #171162)

Tue Dec 9 02:50:51 PST 2025

================
@@ -242,14 +238,10 @@ define <4 x double> @test_compress_v4f64_with_sve(<4 x double> %vec, <4 x i1> %m
 ; CHECK-NEXT:    lsr x9, x8, #32
 ; CHECK-NEXT:    eor w8, w8, w9
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    cmlt v3.2d, v3.2d, #0
-; CHECK-NEXT:    cmlt v4.2d, v4.2d, #0
-; CHECK-NEXT:    and x8, x8, #0x3
-; CHECK-NEXT:    lsl x8, x8, #3
-; CHECK-NEXT:    and z3.d, z3.d, #0x1
-; CHECK-NEXT:    and z4.d, z4.d, #0x1
 ; CHECK-NEXT:    cmpne p1.d, p0/z, z3.d, #0
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z4.d, #0
+; CHECK-NEXT:    and x8, x8, #0x3
+; CHECK-NEXT:    lsl x8, x8, #3
 ; CHECK-NEXT:    compact z0.d, p1, z0.d
 ; CHECK-NEXT:    compact z1.d, p0, z1.d
----------------
MacDue wrote:

Here's my attempt:
```
ushll v2.4s, v2.4h, #0  # Unpack v4i16 predicate to v4i32
ptrue p0.d, vl2
ushll v3.2d, v2.2s, #0  # Unpack low half of v4i32 to v2i64
ushll2 v4.2d, v2.4s, #0 # Unpack high half of v4i32 to v2i64

fmov x8, d2             # Move d2 (the low half of the v4i32 predicate) into x8
shl v3.2d, v3.2d, #63   # Shift up predicate (i.e. ignore everything but bit 0)
shl v4.2d, v4.2d, #63   # ""
lsr x9, x8, #32         # Move high half of v2i32 mask to x9

eor w8, w8, w9          # BUG: this should be an ADD. It is caused by incorrect usage of
                        # VECREDUCE_ADD in SplitVecRes_VECTOR_COMPRESS and is not related to this PR.
                        # This step should reduce the mask to the number of active lanes.
						
mov x9, sp
cmpne p1.d, p0/z, z3.d, #0  # Convert lower Neon predicate to SVE 
cmpne p0.d, p0/z, z4.d, #0  # Convert upper Neon predicate to SVE
and x8, x8, #0x3        
lsl x8, x8, #3          # Compute upper offset
compact z0.d, p1, z0.d  # Compact lower half
compact z1.d, p0, z1.d  # Compact upper half
str q0, [sp]            # Store lower half
str q1, [x9, x8]        # Store upper half
```
Note: While doing this I spotted a bug (unrelated to this PR; the issue is in `SplitVecRes_VECTOR_COMPRESS`), which means this lowering is incorrect. I'll address this in another PR.

https://github.com/llvm/llvm-project/pull/171162