<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/92269>92269</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[aarch64] Spurious optimization of `cmtst+bif+bif` to `shl+cmlt+bif+shl+cmlt+bif`
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
Validark
</td>
</tr>
</table>
<pre>
```zig
const std = @import("std");
/// Splits every byte of `vec` into its low nibble (`vec & 0xF`) and its high
/// nibble (`vec >> 4`) and interlaces the two vectors, so the 16 result bytes
/// are low0, high0, low1, high1, ... with each byte holding one 4-bit value.
fn expand8xu8To16xu4AsByteVector(vec: @Vector(8, u8)) @Vector(16, u8) {
return std.simd.interlace(.{ vec & @as(@Vector(8, u8), @splat(0xF)), vec >> @splat(4) });
}
/// Converts a boolean vector into a byte mask of the same length: lanes where
/// `vec` is true become 0xFF, lanes where it is false become 0x00.
///
/// The mask type is derived from `vec`, so the helper works for any lane
/// count and not only for 16-lane inputs.
fn sel(vec: anytype) @Vector(@typeInfo(@TypeOf(vec)).Vector.len, u8) {
    // Same lane count as the predicate; `@select` requires all three vector
    // operands to agree on length.
    const Mask = @Vector(@typeInfo(@TypeOf(vec)).Vector.len, u8);
    const false_vec: Mask = @splat(0);
    // Bitwise NOT of all-zero bytes gives all-ones (0xFF) bytes.
    const true_vec = ~false_vec;
    return @select(u8, vec, true_vec, false_vec);
}
/// Bitwise select, XOR spelling: each result bit comes from `vec2` where the
/// corresponding bit of `vec1` is 1 and from `vec3` where it is 0
/// (`vec1 = 1` gives `vec2 ^ vec3 ^ vec3 = vec2`; `vec1 = 0` gives `vec3`).
/// Computes the same value as `bsl`; it is kept as a separate spelling because
/// the report compares the machine code generated for the two forms.
fn bsl2(vec1: anytype, vec2: @TypeOf(vec1), vec3: @TypeOf(vec1)) @TypeOf(vec1) {
return (vec1 & (vec2 ^ vec3)) ^ vec3;
}
/// Bitwise select, AND/OR spelling: each result bit comes from `vec2` where
/// the corresponding bit of `vec1` is 1 and from `vec3` where it is 0.
/// This is the form used by `bad_bsl` as written, i.e. the one the report
/// shows compiling to the longer instruction sequence.
fn bsl(vec1: anytype, vec2: @TypeOf(vec1), vec3: @TypeOf(vec1)) @TypeOf(vec1) {
return (vec1 & vec2) | (~vec1 & vec3);
}
/// Reproducer entry point. Builds two 16-byte selector masks from the bits of
/// `x` and uses them to blend `buffer1_*` (mask byte 0xFF) with `buffer2_*`
/// (mask byte 0x00), returning both blended halves as one 32-byte vector.
/// Every spread bit of `x` ends up controlling two adjacent output bytes.
/// NOTE(review): the "start positions" comment below suggests only the low
/// 16 bits of `x` are meant to be set; confirm against the caller.
export fn bad_bsl(x: u64, buffer1_half1: @Vector(16, u8), buffer1_half2: @Vector(16, u8), buffer2_half1: @Vector(16, u8), buffer2_half2: @Vector(16, u8)) @Vector(32, u8) {
const splatted = blk: {
// try to avoid https://github.com/llvm/llvm-project/issues/92211
var y: u64 = x; // spread out each bit of `x` into a nibble of `y`
// zig fmt: off
// start positions: 0b0000000000000000000000000000000000000000000000001111111111111111;
y = (y | (y << 24)); // & 0b0000000000000000000000001111111100000000000000000000000011111111 this AND is optimized out
y = (y | (y << 12)) & 0b0000000000001111000000000000111100000000000011110000000000001111;
y = (y | (y << 6)); // & 0b0000001100000011000000110000001100000011000000110000001100000011 this AND is optimized out
y = (y | (y << 3)) & 0b0001000100010001000100010001000100010001000100010001000100010001;
// zig fmt: on
break :blk y;
};
// Reinterpret the u64 as 8 bytes, split them into 16 nibble-valued bytes,
// compare each against zero and turn the boolean result into a 0xFF/0x00 mask.
const selector_compressed = sel(expand8xu8To16xu4AsByteVector(@bitCast(splatted)) != @as(@Vector(16, u8), @splat(0)));
// Interlacing the mask with itself repeats every lane twice (32 bytes),
// which is then viewed as two 16-byte selector vectors.
const selectors: [2]@Vector(16, u8) = @bitCast(std.simd.interlace(.{ selector_compressed, selector_compressed }));
// Blend each half independently and reassemble the two results as 32 bytes.
return @bitCast([2]@Vector(16, u8){
bsl(
selectors[0],
buffer1_half1,
buffer2_half1,
),
bsl(
selectors[1],
buffer1_half2,
buffer2_half2,
),
});
}
```
Compiled on aarch64:
```asm
// AArch64 output for the AND/OR form of `bsl`: after the lane-duplicating
// zips, each of the two selector masks is built by its own shl #7 + cmlt #0
// pair before the bif.
bad_bsl:
// Scalar part: shift/or/and sequence on x0 that leaves the result in x9.
orr x9, x0, x0, lsl #24
orr x9, x9, x9, lsl #12
and x9, x9, #0xf000f000f000f
orr x9, x9, x9, lsl #6
orr x9, x9, x9, lsl #3
and x9, x9, #0x1111111111111111
// Move to a vector register and split the 8 bytes into low/high nibbles,
// interleaved into 16 bytes by zip1.
fmov d4, x9
movi v5.8b, #15
and v5.8b, v4.8b, v5.8b
ushr v4.8b, v4.8b, #4
zip1 v4.16b, v5.16b, v4.16b
// Zipping the vector with itself repeats every lane twice: v4 = first half,
// v5 = second half of the 32 selector bytes.
zip2 v5.16b, v4.16b, v4.16b
zip1 v4.16b, v4.16b, v4.16b
// Mask for the first half: move bit 0 of each byte into the sign bit, then
// set the whole byte where it is negative.
shl v4.16b, v4.16b, #7
cmlt v4.16b, v4.16b, #0
bif v0.16b, v2.16b, v4.16b
// Mask for the second half: the same shl + cmlt pair repeated on v5.
shl v2.16b, v5.16b, #7
cmlt v2.16b, v2.16b, #0
bif v1.16b, v3.16b, v2.16b
// Store both 16-byte halves through the pointer in x8.
stp q0, q1, [x8]
ret
```
If you switch the `bsl` <--> `bsl2` names, you get:
```asm
// AArch64 output for the XOR form of `bsl`: the selector mask is produced by
// a single cmtst before the lane-duplicating zips, and both bif instructions
// use the zipped results directly.
bad_bsl:
// Scalar part: shift/or/and sequence on x0 that leaves the result in x9.
orr x9, x0, x0, lsl #24
orr x9, x9, x9, lsl #12
and x9, x9, #0xf000f000f000f
orr x9, x9, x9, lsl #6
orr x9, x9, x9, lsl #3
and x9, x9, #0x1111111111111111
// Move to a vector register and split the 8 bytes into low/high nibbles,
// interleaved into 16 bytes by zip1.
fmov d4, x9
movi v5.8b, #15
and v5.8b, v4.8b, v5.8b
ushr v4.8b, v4.8b, #4
zip1 v4.16b, v5.16b, v4.16b
// cmtst of a register with itself sets every byte that is non-zero to all
// ones: one instruction builds the full mask.
cmtst v4.16b, v4.16b, v4.16b
// Repeat every mask lane twice: v4 = first half, v5 = second half.
zip2 v5.16b, v4.16b, v4.16b
zip1 v4.16b, v4.16b, v4.16b
bif v0.16b, v2.16b, v4.16b
bif v1.16b, v3.16b, v5.16b
// Store both 16-byte halves through the pointer in x8.
stp q0, q1, [x8]
ret
```
arm32 has the same problem with the `and/andn` definition of `bsl`. The `XOR` definition would probably have the same problem once https://github.com/llvm/llvm-project/issues/92267 is resolved.
```asm
@ ARM32 output for the AND/OR form of `bsl`: both selector halves are built
@ with a vshl.i8 #7 + vshr.s8 #7 pair after the zips, then used by vbsl.
bad_bsl:
push {r11, lr}
@ Scalar part: the 64-bit shift/or/and sequence carried out on 32-bit
@ registers; the two result words end up in r2 and r1.
lsl r1, r3, #24
orr r12, r2, r2, lsl #12
vmov.i8 d16, #0xf
orr r1, r1, r2, lsr #8
movw lr, #15
movt lr, #15
orr r1, r1, r3
lsl r3, r1, #12
orr r2, r3, r2, lsr #20
and r3, r12, lr
orr r3, r3, r3, lsl #6
orr r1, r2, r1
and r1, r1, lr
orr r2, r3, r3, lsl #3
orr r1, r1, r1, lsl #6
movw r3, #4369
orr r1, r1, r1, lsl #3
movt r3, #4369
and r2, r2, r3
and r1, r1, r3
@ Move the two words into d17, then split into low nibbles (d18) and high
@ nibbles (d19). The vld1.64 loads interleaved below read 16-byte values
@ from sp-relative addresses.
vmov d17, r2, r1
add r1, sp, #40
vand d18, d17, d16
vshr.u8 d19, d17, #4
vld1.64 {d20, d21}, [r1]
add r1, sp, #56
vld1.64 {d22, d23}, [r1]
add r1, sp, #8
@ Interleave the nibbles, copy the result, and zip it with itself so every
@ lane is repeated twice across q9 and q8.
vzip.8 d18, d19
vorr q8, q9, q9
vld1.64 {d24, d25}, [r1]
add r1, sp, #24
vzip.8 q9, q8
@ Mask construction for both halves: shift bit 0 into the sign bit, then an
@ arithmetic shift right by 7 replicates it across the byte.
vshl.i8 q9, q9, #7
vshl.i8 q8, q8, #7
vshr.s8 q9, q9, #7
vshr.s8 q8, q8, #7
vbsl q9, q12, q10
vld1.64 {d20, d21}, [r1]
vbsl q8, q10, q11
@ Store the two 16-byte results through r0 (post-incremented after the first).
vst1.8 {d18, d19}, [r0:128]!
vst1.64 {d16, d17}, [r0:128]
pop {r11, pc}
```
```llvm
; Optimized LLVM IR for bad_bsl. The 32-byte result is written through the
; sret pointer %0; %1 is `x`, and %2..%5 are the four 16-byte buffer arguments
; in declaration order (buffer1_half1, buffer1_half2, buffer2_half1, buffer2_half2).
define dso_local void @bad_bsl(ptr noalias nocapture nonnull writeonly sret(<32 x i8>) %0, i64 %1, <16 x i8> %2, <16 x i8> %3, <16 x i8> %4, <16 x i8> %5) local_unnamed_addr {
Entry:
; y | (y << 24), then y | (y << 12) with its mask.
%6 = shl i64 %1, 24
%7 = or i64 %6, %1
%8 = shl i64 %7, 12
%9 = or i64 %8, %7
%10 = and i64 %9, 4222189076152335
; The "<< 6" and "<< 3" steps appear here as multiplies: 65 = 1 + 64 and
; 520 = 65 * 8.
%11 = mul nuw nsw i64 %10, 65
%12 = mul nuw nsw i64 %10, 520
%13 = or i64 %12, %11
%14 = and i64 %13, 1229782938247303441
; View the i64 as 8 bytes and interleave low nibbles with high nibbles.
%15 = bitcast i64 %14 to <8 x i8>
%16 = and <8 x i8> %15, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
%17 = lshr <8 x i8> %15, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
%18 = shufflevector <8 x i8> %16, <8 x i8> %17, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
; The selector is carried as a 0/1 byte per lane (icmp ne + zext), not as
; a 0xFF/0x00 mask.
%19 = icmp ne <16 x i8> %18, zeroinitializer
%20 = zext <16 x i1> %19 to <16 x i8>
; Repeat every lane twice to get the 32 selector bytes.
%21 = shufflevector <16 x i8> %20, <16 x i8> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
; First half: extract lanes 0-15, truncate each byte back to i1 (keeps
; bit 0) and select between %2 and %4.
%.sroa.09.0.vec.extract = shufflevector <32 x i8> %21, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%22 = trunc <16 x i8> %.sroa.09.0.vec.extract to <16 x i1>
%23 = select <16 x i1> %22, <16 x i8> %2, <16 x i8> %4
; Second half: the same with lanes 16-31, selecting between %3 and %5.
%.sroa.09.16.vec.extract = shufflevector <32 x i8> %21, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%24 = trunc <16 x i8> %.sroa.09.16.vec.extract to <16 x i1>
%25 = select <16 x i1> %24, <16 x i8> %3, <16 x i8> %5
; Concatenate the two 16-byte results into one 32-byte vector and store it.
%.sroa.010.0.vecblend = shufflevector <16 x i8> %23, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%.sroa.010.16.vec.expand = shufflevector <16 x i8> %25, <16 x i8> poison, <32 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%.sroa.010.16.vecblend = shufflevector <32 x i8> %.sroa.010.0.vecblend, <32 x i8> %.sroa.010.16.vec.expand, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
store <32 x i8> %.sroa.010.16.vecblend, ptr %0, align 32
ret void
}
; Declarations of the debug-info intrinsics. The body of @bad_bsl in this
; listing contains no calls to them, and attribute group #1 is not defined in
; this excerpt.
declare void @llvm.dbg.value(metadata, metadata, metadata) #1
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzcWs1v2zgW_2uYC1FDJEVZPuSQOA0wlx1gtxjsLaAkOtaW-ihJOXYO87cvSErUt5O0nUV3jNZmHh9_7_F9krKZUvlzyfktoPeAPtywRh8refsHE3nG5NebpMoutyAK3L_X_BkEDyC4S6tSaah0BgF5gCAM8qKupAY4BhgrnQGMAd4Bcu_Y3fuhhPxcszKLz038pULRuQnv1P1F8z94qisJcHziKSB3BtCTYoD3sIkNHN6NZlDkpyDYtqIghFBy3cjSqLdReZFt8lJzKVjKAY43YHsPTzyFAEcGjSmj9Iq8veFQtWBmZ8H50Slh6BaBfAbk85AldKo8jPa-fZgYQXHRb5WVF32p-XRvIAwM-bfyULm_vlxq_vuhXWfV2DjmjeDlkh2ciw5MKP60YNaR8ZwP_UaH6ht7OigtG4tk2f8cAM8tb8C44KlBa-LWXuajwzDjHuGqtRIlsNs2GtnLYuJ2W0ProN5FZHV6t0RejKJ22sWLHWMI6GcH30J1f17dxa-yCSvR8u0N_c_hBLniC342KQ7NZlj25DZ0Nro1UWhUTZrDgUv0dGTigK6E25QXv4sXfwAXvwN3nG4Er5SSttCZzNDcVbtEfLXIQzbzAvgR4Eeo5QXqCrJTlWfwqHWtALlzc8-5PjbJJq0KgB-FOHUfn2pZ_cdmy2OuVMMVwI87jBEaCzgxCS-twa0mZ0DuO7GqlpxlsGo05Cw9wiTXsDpAEAVnEAUwL41OsMyTRPB24mKK-tIWXvNneCi0EVUdDossSjOpYV2pXOdVabYIgyT44AtNXn0pubiahONLF6aGsgdkD3HYVuF-7zZ618V38G_NQ33MFbz7xwPMFaxqnRf5K7cmHZvginIId8E102iqwXv-fpdBYHTdIB3sxz9_gkEgmRoE_cj_3iALoVqOtUokZ18hIHeJ-GryZpCuprKNzyaDXLeNq5JPaVXUkivVpr3r2m8dYEAYJLneM2VaX1c2vAlQ22lnh45JGZs2486_a7ra_AP0HgP68EaTH2i3ejxaMIHBWbSMO-zMznrjw0Av9LqSs5rqukxL63dL7wMDgvdjbrti1IU8x7iJTBc6s18RPXgNtUAjLcZNbV05vMIx0mL1FNmdxYfW3ldFnQuTmiVkTKbHKDRNZ8DhlzFVOErXxDs-WElp1TjbEDwH_btQAgJMcDjWd7Jg8N4uQLhdwMpsxgkwCc6HIAj8_w-iR8t6zxnJGHhVmVk7Gi07FNXJfGZhu2w0W1Sn3PZouomTFhHRZcGe5xR2A0sZMTfqaLfV84Q98sQRr3mNWl4UdYDdyNGm_LhVZMy1zj_Fv8qvjmKRF2CyHXOmhdBryMYnk5TMD_AUeE78lg4WGc9tsqLHgBVfV8ICI89NJuvGiugafrNp9A1ZRHp_jk3dGHFJrq_k928HeKkaqF5ynR6hPnJzfDPJGwWm0X76ZO-hloQNrWSFOUTu7apnrj9eDJaT8GfWhLey8WeUhvfy_8QK8TcoD2mhlf5Auv_V5eQjaX81O-mA36Slef1oajJZEAyPTNm0VKzgsJZVIngBX3Ltk5WVGcCPrMxKk58ZP-SlvTm1VzGXyxv4xXH_-_d_TtheqkZkFpkl4gKP7MTnAqsy5T925Yy25qgvuarEiWeb7ysadaOO0J6y7yWyhhXSn19sylmz2hlJ2phdKyQS2au5HLxPK8mpqE6bPIaZO0m66rGGZmHQEEyaFfEsX1-M1suZWlQn27VWGRbFkakBSD-5UBg9Bu7tNFIZB_P60WHi1urLiGSASN4um0NzSbRctIY7XZWLV-SSyUluZDa0rqD1kt-QKXwk2s1lr8KRZbdegfPbHQblSvdYdH77OnWHSbRdsC3Lhgiq7tQJps-DnKQM2UesLZbJgjGbOspNY9JjN2Cbt4mTyNAmCk3aZthWxQwjew2xlVGi
WWVc0ZNOFRgCYwdMvge4S9LTa15v4sHGJ146tYH0zU5_27Xv60qFTin6PUpNC1enXCvW66yOwhQpr83iOdRzxe3iNS65Ue_AclzXsRJXjlosVzm-oS7UPh4UHjDukOzH9Fmm0mgTW9jeiT1sAMgdwrYX46WVrUau4JuIXlw67kpVPehIdXr9Ru2Jtmdaku3GHGaqehJVygS0T3hBGPSPw2stYVkxkTMFyypltW4kh2VVlo0Q8EXmmleluEBlzhQ4BmRPMDzDPAbks3tCRK25crM_TF1rIHsUdUyGihepZJEaLlKpkWX38NSU5q6QPbEsk_0j7c-llpe-rwNMI_cM7ChGuvWxDzDdWpZKdhxtM6ZowBNPYWwt6lsfwHQ3gWnjlm4HPCiwTKb-tVw2ekOMMYp3wTZCFBNChyuQXVE0ApbNCyzVi9-ItXg0YsbXmWnfeg2NTDR2OWSFDtnCqc6IuM3j3TbGOxLjcEsCEoajVdR955DrlCntV4ZQV8avsY-ewZLICxpyOLQ2HvIYuvGPDMZSnfuFuZdcExu26z_-ORbXRVJzOAh-sk_k5nKjVu6Yuh0lBcGWTvY5wdBlH8Ew7gaoG-y6AfZTnpt4kmcPPcnzU0_y_JEnef6tJ00t7DIjT4salnye1a6OvnJZ2UsDE_krl4P12CXNKz_rfjHqFu_aiOoxh0vRsrUnhSmYV5u6ylVVtnRX7JYNHswMjmYGxzN7k5m5w5m16czY0czW25nf45nfd3O_D0ZoYYQXRmRhFC6M6FIYAEw3SlZsE-w2webE0w0_a8lSveyevrk4J47cMHfPtXyYO2Nu-rmh52ZdMuKSwZaMM8kH7Kq0lk2ZzoNxxU7DIEcTPNJ9z8PThQzBy313mTrsi14TFP0vXIb6mtKXkr6c9WXMOwD3zu1TzTsAewdg72DsZWAvA3sZ2MsgfYWcGjt8l_MmJrvmPXrde8snoeVTE52lGwpcHCWCl9m7auEC8rtr4a-RbO2o1_pv9tcofHo_-5Cr2TtdTT_u6l_CAH_hX79YKF919ZWcHlfjpVowr8_rwfT_lvmhhws9HvV41OPR_oTr8Wivbn8Q8wpTrzH1MqiXEXkZkZcReRkRGbhT6Urytx3gXWUu6N0lm4n8uYTE3z0l1_ZK3z4AGP_4LeOpYJL7O78Qp2KTJc-bExMNBzguuGYZ08wgL4939invu0DbiY_Ddk8tbrJbku3Ijt3wW7RFdBsTitHN8TbjhEU0PRwSGiYkTkgSpUHI-TbhaIu24U1-iwMcBhRRFBJE6SaIw3C3S0MS8YgGbAfCgBcsFxurbSWfb-wXCLc7jKPdjWAJF8r-mhjjkr9AOwkwBvThRt7arx6S5lmZ3eZKqx5F51rYnyF3P2CgD_BfdSPzqvE_PmKDr0zst1QA3yf5oX2PAntAiAJ1FADfp4Xo52ekKLhppLj9sW9Ldv8NAAD__9WDSxQ">