[llvm] 166968a - [AArch64] Add test cases where zext can be lowered to a series of tbl instructions.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 25 07:36:51 PST 2022
Author: Florian Hahn
Date: 2022-02-25T15:36:32Z
New Revision: 166968a8926aef59450f2679e040bc678f561726
URL: https://github.com/llvm/llvm-project/commit/166968a8926aef59450f2679e040bc678f561726
DIFF: https://github.com/llvm/llvm-project/commit/166968a8926aef59450f2679e040bc678f561726.diff
LOG: [AArch64] Add test cases where zext can be lowered to a series of tbl instructions.
Add a set of tests for upcoming patches that allow lowering vector zext
using AArch64 tbl instructions instead of shifts.
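The rewrite those patches aim at can be sketched directly in IR: on a
little-endian target, zext'ing <16 x i8> to <16 x i32> is the same as
interleaving the source bytes with zero bytes and reinterpreting the result,
which tbl can do with precomputed byte-index masks. A minimal sketch, assuming
little-endian layout (the function name and mask formatting are illustrative,
not taken from the patches):

  define <16 x i32> @zext_via_shuffle(<16 x i8> %l) {
    ; Byte 4*i of the result is source byte i and all other bytes are zero
    ; (mask indices >= 16 select from the zeroinitializer operand), so i32
    ; element i of the bitcast result is zext(%l[i]).
    %s = shufflevector <16 x i8> %l, <16 x i8> zeroinitializer, <64 x i32> <
        i32 0,  i32 16, i32 16, i32 16, i32 1,  i32 16, i32 16, i32 16,
        i32 2,  i32 16, i32 16, i32 16, i32 3,  i32 16, i32 16, i32 16,
        i32 4,  i32 16, i32 16, i32 16, i32 5,  i32 16, i32 16, i32 16,
        i32 6,  i32 16, i32 16, i32 16, i32 7,  i32 16, i32 16, i32 16,
        i32 8,  i32 16, i32 16, i32 16, i32 9,  i32 16, i32 16, i32 16,
        i32 10, i32 16, i32 16, i32 16, i32 11, i32 16, i32 16, i32 16,
        i32 12, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16,
        i32 14, i32 16, i32 16, i32 16, i32 15, i32 16, i32 16, i32 16>
    %r = bitcast <64 x i8> %s to <16 x i32>
    ret <16 x i32> %r
  }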
Added:
llvm/test/CodeGen/AArch64/zext-to-tbl.ll
llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll
Modified:
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
new file mode 100644
index 0000000000000..895acc51287fc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -0,0 +1,192 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -o - %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios"
+
+; It's profitable to convert the zext to a shuffle, which in turn will be
+; lowered to 4 tbl instructions. The masks are materialized outside the loop.
+define void @zext_v16i8_to_v16i32_in_loop(i8* %src, i32* %dst) {
+; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB0_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr q0, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ushll2.8h v1, v0, #0
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll2.4s v2, v1, #0
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: ushll2.4s v3, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: stp q1, q2, [x1, #32]
+; CHECK-NEXT: stp q0, q3, [x1], #64
+; CHECK-NEXT: b.ne LBB0_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %src.gep = getelementptr i8, i8* %src, i64 %iv
+ %src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
+ %load = load <16 x i8>, <16 x i8>* %src.gep.cast
+ %ext = zext <16 x i8> %load to <16 x i32>
+ %dst.gep = getelementptr i32, i32* %dst, i64 %iv
+ %dst.gep.cast = bitcast i32* %dst.gep to <16 x i32>*
+ store <16 x i32> %ext, <16 x i32>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16
+ %ec = icmp eq i64 %iv.next, 128
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
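For illustration, a tbl-based version of the loop above could look roughly
like the following hand-written sketch (the label names, register choices and
scheduling are assumptions, not compiler output). It relies on tbl writing 0
for out-of-range indices (255 here), which supplies the zero bytes of the
extension, and on the four masks being loaded once, outside the loop:

      .section __TEXT,__literal16,16byte_literals
  lCPI_mask0: .byte  0, 255, 255, 255,  1, 255, 255, 255,  2, 255, 255, 255,  3, 255, 255, 255
  lCPI_mask1: .byte  4, 255, 255, 255,  5, 255, 255, 255,  6, 255, 255, 255,  7, 255, 255, 255
  lCPI_mask2: .byte  8, 255, 255, 255,  9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255
  lCPI_mask3: .byte 12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255

      ; Materialize the four byte-index masks once, before the loop.
      adrp    x9, lCPI_mask0@PAGE
      ldr     q1, [x9, lCPI_mask0@PAGEOFF]
      adrp    x9, lCPI_mask1@PAGE
      ldr     q2, [x9, lCPI_mask1@PAGEOFF]
      adrp    x9, lCPI_mask2@PAGE
      ldr     q3, [x9, lCPI_mask2@PAGEOFF]
      adrp    x9, lCPI_mask3@PAGE
      ldr     q4, [x9, lCPI_mask3@PAGEOFF]
      mov     x8, xzr
  LBB_loop:
      ldr     q0, [x0, x8]
      add     x8, x8, #16
      cmp     x8, #128
      tbl.16b v5, { v0 }, v1       ; bytes 0-3  -> i32 elements 0-3
      tbl.16b v6, { v0 }, v2       ; bytes 4-7  -> i32 elements 4-7
      tbl.16b v7, { v0 }, v3       ; bytes 8-11 -> i32 elements 8-11
      tbl.16b v16, { v0 }, v4      ; bytes 12-15 -> i32 elements 12-15
      stp     q5, q6, [x1]
      stp     q7, q16, [x1, #32]
      add     x1, x1, #64
      b.ne    LBB_loop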
+
+; Not profitable to use shuffle/tbl, as 4 tbls + materializing the masks
+; require more instructions than lowering zext directly.
+define void @zext_v16i8_to_v16i32_no_loop(i8* %src, i32* %dst) {
+; CHECK-LABEL: zext_v16i8_to_v16i32_no_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ushll2.8h v1, v0, #0
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll2.4s v2, v1, #0
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: ushll2.4s v3, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: stp q1, q2, [x1, #32]
+; CHECK-NEXT: stp q0, q3, [x1]
+; CHECK-NEXT: ret
+entry:
+ %src.cast = bitcast i8* %src to <16 x i8>*
+ %load = load <16 x i8>, <16 x i8>* %src.cast
+ %ext = zext <16 x i8> %load to <16 x i32>
+ %dst.cast = bitcast i32* %dst to <16 x i32>*
+ store <16 x i32> %ext, <16 x i32>* %dst.cast
+ ret void
+}
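A rough count makes the comparison concrete (assuming each mask costs an
adrp+ldr pair to materialize): a tbl version of this straight-line function
would spend 4 tbl + 4 adrp + 4 ldr = 12 instructions on the extension, while
the ushll lowering above needs only 6 and no constant-pool loads.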
+
+define void @zext_v16i8_to_v16i16_in_loop(i8* %src, i16* %dst) {
+; CHECK-LABEL: zext_v16i8_to_v16i16_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB2_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr q0, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ushll2.8h v1, v0, #0
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: stp q0, q1, [x1], #32
+; CHECK-NEXT: b.ne LBB2_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %src.gep = getelementptr i8, i8* %src, i64 %iv
+ %src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
+ %load = load <16 x i8>, <16 x i8>* %src.gep.cast
+ %ext = zext <16 x i8> %load to <16 x i16>
+ %dst.gep = getelementptr i16, i16* %dst, i64 %iv
+ %dst.gep.cast = bitcast i16* %dst.gep to <16 x i16>*
+ store <16 x i16> %ext, <16 x i16>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16
+ %ec = icmp eq i64 %iv.next, 128
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @zext_v8i8_to_v8i32_in_loop(i8* %src, i32* %dst) {
+; CHECK-LABEL: zext_v8i8_to_v8i32_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB3_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d0, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll2.4s v1, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: stp q0, q1, [x1], #64
+; CHECK-NEXT: b.ne LBB3_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %src.gep = getelementptr i8, i8* %src, i64 %iv
+ %src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
+ %load = load <8 x i8>, <8 x i8>* %src.gep.cast
+ %ext = zext <8 x i8> %load to <8 x i32>
+ %dst.gep = getelementptr i32, i32* %dst, i64 %iv
+ %dst.gep.cast = bitcast i32* %dst.gep to <8 x i32>*
+ store <8 x i32> %ext, <8 x i32>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16
+ %ec = icmp eq i64 %iv.next, 128
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @zext_v16i8_to_v16i64_in_loop(i8* %src, i64* %dst) {
+; CHECK-LABEL: zext_v16i8_to_v16i64_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB4_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr q0, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ushll.8h v1, v0, #0
+; CHECK-NEXT: ushll2.8h v0, v0, #0
+; CHECK-NEXT: ushll2.4s v2, v1, #0
+; CHECK-NEXT: ushll2.4s v3, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ushll2.2d v4, v3, #0
+; CHECK-NEXT: ushll2.2d v5, v0, #0
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: ushll.2d v3, v3, #0
+; CHECK-NEXT: stp q0, q5, [x1, #64]
+; CHECK-NEXT: ushll.4s v0, v1, #0
+; CHECK-NEXT: stp q3, q4, [x1, #96]
+; CHECK-NEXT: ushll2.2d v3, v2, #0
+; CHECK-NEXT: ushll.2d v2, v2, #0
+; CHECK-NEXT: ushll2.2d v1, v0, #0
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: stp q2, q3, [x1, #32]
+; CHECK-NEXT: stp q0, q1, [x1], #128
+; CHECK-NEXT: b.ne LBB4_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %src.gep = getelementptr i8, i8* %src, i64 %iv
+ %src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
+ %load = load <16 x i8>, <16 x i8>* %src.gep.cast
+ %ext = zext <16 x i8> %load to <16 x i64>
+ %dst.gep = getelementptr i64, i64* %dst, i64 %iv
+ %dst.gep.cast = bitcast i64* %dst.gep to <16 x i64>*
+ store <16 x i64> %ext, <16 x i64>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16
+ %ec = icmp eq i64 %iv.next, 128
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll
new file mode 100644
index 0000000000000..77dfcd2f320e0
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll
@@ -0,0 +1,184 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -codegenprepare -S %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios"
+
+; It's profitable to convert the zext to a shuffle, which in turn will be
+; lowered to 4 tbl instructions. The masks are materialized outside the loop.
+define void @zext_v16i8_to_v16i32_in_loop(i8* %src, i32* %dst) {
+; CHECK-LABEL: @zext_v16i8_to_v16i32_in_loop(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <16 x i8>*
+; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_GEP_CAST]], align 16
+; CHECK-NEXT: [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i32>
+; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[DST_GEP_CAST:%.*]] = bitcast i32* [[DST_GEP]] to <16 x i32>*
+; CHECK-NEXT: store <16 x i32> [[EXT]], <16 x i32>* [[DST_GEP_CAST]], align 64
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %src.gep = getelementptr i8, i8* %src, i64 %iv
+ %src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
+ %load = load <16 x i8>, <16 x i8>* %src.gep.cast
+ %ext = zext <16 x i8> %load to <16 x i32>
+ %dst.gep = getelementptr i32, i32* %dst, i64 %iv
+ %dst.gep.cast = bitcast i32* %dst.gep to <16 x i32>*
+ store <16 x i32> %ext, <16 x i32>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16
+ %ec = icmp eq i64 %iv.next, 128
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; Not profitable to use shuffle/tbl, as 4 tbls + materializing the masks
+; require more instructions than lowering zext directly.
+define void @zext_v16i8_to_v16i32_no_loop(i8* %src, i32* %dst) {
+; CHECK-LABEL: @zext_v16i8_to_v16i32_no_loop(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SRC_CAST:%.*]] = bitcast i8* [[SRC:%.*]] to <16 x i8>*
+; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_CAST]], align 16
+; CHECK-NEXT: [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i32>
+; CHECK-NEXT: [[DST_CAST:%.*]] = bitcast i32* [[DST:%.*]] to <16 x i32>*
+; CHECK-NEXT: store <16 x i32> [[EXT]], <16 x i32>* [[DST_CAST]], align 64
+; CHECK-NEXT: ret void
+;
+entry:
+ %src.cast = bitcast i8* %src to <16 x i8>*
+ %load = load <16 x i8>, <16 x i8>* %src.cast
+ %ext = zext <16 x i8> %load to <16 x i32>
+ %dst.cast = bitcast i32* %dst to <16 x i32>*
+ store <16 x i32> %ext, <16 x i32>* %dst.cast
+ ret void
+}
+
+define void @zext_v16i8_to_v16i16_in_loop(i8* %src, i16* %dst) {
+; CHECK-LABEL: @zext_v16i8_to_v16i16_in_loop(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <16 x i8>*
+; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_GEP_CAST]], align 16
+; CHECK-NEXT: [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i16>
+; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i16, i16* [[DST:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[DST_GEP_CAST:%.*]] = bitcast i16* [[DST_GEP]] to <16 x i16>*
+; CHECK-NEXT: store <16 x i16> [[EXT]], <16 x i16>* [[DST_GEP_CAST]], align 32
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %src.gep = getelementptr i8, i8* %src, i64 %iv
+ %src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
+ %load = load <16 x i8>, <16 x i8>* %src.gep.cast
+ %ext = zext <16 x i8> %load to <16 x i16>
+ %dst.gep = getelementptr i16, i16* %dst, i64 %iv
+ %dst.gep.cast = bitcast i16* %dst.gep to <16 x i16>*
+ store <16 x i16> %ext, <16 x i16>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16
+ %ec = icmp eq i64 %iv.next, 128
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @zext_v8i8_to_v8i32_in_loop(i8* %src, i32* %dst) {
+; CHECK-LABEL: @zext_v8i8_to_v8i32_in_loop(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <8 x i8>*
+; CHECK-NEXT: [[LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[SRC_GEP_CAST]], align 8
+; CHECK-NEXT: [[EXT:%.*]] = zext <8 x i8> [[LOAD]] to <8 x i32>
+; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[DST_GEP_CAST:%.*]] = bitcast i32* [[DST_GEP]] to <8 x i32>*
+; CHECK-NEXT: store <8 x i32> [[EXT]], <8 x i32>* [[DST_GEP_CAST]], align 32
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %src.gep = getelementptr i8, i8* %src, i64 %iv
+ %src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
+ %load = load <8 x i8>, <8 x i8>* %src.gep.cast
+ %ext = zext <8 x i8> %load to <8 x i32>
+ %dst.gep = getelementptr i32, i32* %dst, i64 %iv
+ %dst.gep.cast = bitcast i32* %dst.gep to <8 x i32>*
+ store <8 x i32> %ext, <8 x i32>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16
+ %ec = icmp eq i64 %iv.next, 128
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @zext_v16i8_to_v16i64_in_loop(i8* %src, i64* %dst) {
+; CHECK-LABEL: @zext_v16i8_to_v16i64_in_loop(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <16 x i8>*
+; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_GEP_CAST]], align 16
+; CHECK-NEXT: [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i64>
+; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i64, i64* [[DST:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[DST_GEP_CAST:%.*]] = bitcast i64* [[DST_GEP]] to <16 x i64>*
+; CHECK-NEXT: store <16 x i64> [[EXT]], <16 x i64>* [[DST_GEP_CAST]], align 128
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %src.gep = getelementptr i8, i8* %src, i64 %iv
+ %src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
+ %load = load <16 x i8>, <16 x i8>* %src.gep.cast
+ %ext = zext <16 x i8> %load to <16 x i64>
+ %dst.gep = getelementptr i64, i64* %dst, i64 %iv
+ %dst.gep.cast = bitcast i64* %dst.gep to <16 x i64>*
+ store <16 x i64> %ext, <16 x i64>* %dst.gep.cast
+ %iv.next = add nuw i64 %iv, 16
+ %ec = icmp eq i64 %iv.next, 128
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}