[llvm] [AArch64] Fix widening error for masked load/store integer scalable ve… (PR #99354)
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 22 01:51:26 PDT 2024
================
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @store_i8(<vscale x 1 x i1> %pred, ptr %x, i64 %base, <vscale x 1 x i8> %val) #0 {
+; CHECK-LABEL: store_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d
+; CHECK-NEXT: uzp1 p1.d, p1.d, p1.d
+; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s
+; CHECK-NEXT: uzp1 p1.s, p1.s, p1.s
+; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h
+; CHECK-NEXT: uzp1 p1.h, p1.h, p1.h
+; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.masked.store.nxv1i8.p0nxv1i8(<vscale x 1 x i8> %val, ptr %x, i32 1, <vscale x 1 x i1> %pred)
+ ret void
+}
+
+define void @store_i16(<vscale x 1 x i1> %pred, ptr %x, i64 %base, <vscale x 1 x i16> %val) #0 {
+; CHECK-LABEL: store_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d
+; CHECK-NEXT: uzp1 p1.d, p1.d, p1.d
+; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s
+; CHECK-NEXT: uzp1 p1.s, p1.s, p1.s
+; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.masked.store.nxv1i16.p0nxv1i16(<vscale x 1 x i16> %val, ptr %x, i32 1, <vscale x 1 x i1> %pred)
+ ret void
+}
+
+define void @store_i32(<vscale x 1 x i1> %pred, ptr %x, i64 %base, <vscale x 1 x i32> %val) #0 {
+; CHECK-LABEL: store_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d
+; CHECK-NEXT: uzp1 p1.d, p1.d, p1.d
+; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.masked.store.nxv1i32.p0nxv1i32(<vscale x 1 x i32> %val, ptr %x, i32 1, <vscale x 1 x i1> %pred)
+ ret void
+}
+
+define void @store_i64(<vscale x 1 x i1> %pred, ptr %x, i64 %base, <vscale x 1 x i64> %val) #0 {
+; CHECK-LABEL: store_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
----------------
sdesmalen-arm wrote:
Looking more closely at the tests now, I actually don't think this is correct.
If we would have a masked.store writing `<vscale x 1 x i64> splat(i64 42)` to some pointer with an all true mask, then for vscale = 2 we would expect it to write in memory the sequence: `42, 42`. With this patch, the operation would write: `42, <unchanged>, 42, <unchanged>`.
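For reference, the scenario above could be written in IR along these lines (a minimal sketch, assuming the opaque-pointer intrinsic mangling; not taken from the patch):
```
define void @example(ptr %p) {
  ; all-true mask: every lane of the <vscale x 1 x i64> value should be stored
  call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> splat (i64 42), ptr %p, i32 8, <vscale x 1 x i1> splat (i1 true))
  ret void
}
```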
An unpacked data type `<vscale x 1 x i64>`, held within the packed container type `<vscale x 2 x i64>`, has the following layout:
```
vscale=1: <value, _>
vscale=2: <value, _ | value, _>
vscale=4: <value, _ | value, _ | value, _ | value, _>
```
What the widening of a scalable-vector store would look like (using this example) is:
* Transforming the data vector `<value, _ | value, _>` into `<value, value | _, _>` (e.g. for vscale = 2).
* Transforming the mask in a similar way, so that the mask would be `<mask, mask | 0, 0>`.
* Doing a masked store of double words.
We'd ideally want to use `st1d { z0.q }, p0, [x0]`, but that instruction does not exist, so the widening would need to emulate that operation using `st1d { z0.d }`.
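A rough sketch of that emulation in SVE assembly (illustrative only, with assumed register assignments matching the tests above; this is not what the patch currently emits):
```
pfalse p1.b               // all-false predicate for the inactive half
uzp1   z0.d, z0.d, z0.d   // pack the even (active) doublewords: <value, value | _, _>
uzp1   p0.d, p0.d, p1.d   // pack the mask the same way: <mask, mask | 0, 0>
st1d   { z0.d }, p0, [x0] // masked store of doublewords
```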
https://github.com/llvm/llvm-project/pull/99354