[llvm] 68f4579 - [AARCH64][SVE] Do not optimize vector conversions
Sjoerd Meijer via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 19 08:50:41 PST 2023
Author: Zino Benaissa
Date: 2023-01-19T16:50:31Z
New Revision: 68f45796edbdc4331a6be0b6e9d58f01d8b1fba0
URL: https://github.com/llvm/llvm-project/commit/68f45796edbdc4331a6be0b6e9d58f01d8b1fba0
DIFF: https://github.com/llvm/llvm-project/commit/68f45796edbdc4331a6be0b6e9d58f01d8b1fba0.diff
LOG: [AARCH64][SVE] Do not optimize vector conversions
shuffle_vector instructions are serialized when targeting SVE fixed-length vectors, see
https://reviews.llvm.org/D139111. This patch disables the
optimizeExtendOrTruncateConversion peephole, which generates shuffle_vector instructions.
Differential Revision: https://reviews.llvm.org/D141439
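
For context: on NEON, the optimizeExtendOrTruncateConversion peephole rewrites wide
vector extends into tbl lookups. A minimal sketch of the kind of IR the peephole
matches (hypothetical function name; the added test below exercises the same zext
inside a loop):

    define <16 x i32> @zext_16xi8(<16 x i8> %v) {
      ; On NEON this zext may be lowered via tbl; with SVE fixed-length
      ; vectors the resulting shuffle_vector is serialized, so the
      ; transform is now skipped.
      %e = zext <16 x i8> %v to <16 x i32>
      ret <16 x i32> %e
    }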
Added:
llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 324d1cf0d007..eaf467d7831a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14177,6 +14177,11 @@ static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
 bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
                                                                Loop *L) const {
+  // shuffle_vector instructions are serialized when targeting SVE,
+  // see LowerSPLAT_VECTOR. This peephole is not beneficial.
+  if (Subtarget->useSVEForFixedLengthVectors())
+    return false;
+
   // Try to optimize conversions using tbl. This requires materializing constant
   // index vectors, which can increase code size and add loads. Skip the
   // transform unless the conversion is in a loop block guaranteed to execute
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll b/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll
new file mode 100644
index 000000000000..844afc27907c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll
@@ -0,0 +1,59 @@
+
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -opaque-pointers -aarch64-sve-vector-bits-min=256 -verify-machineinstrs | FileCheck %s --check-prefixes=SVE256
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -opaque-pointers -aarch64-sve-vector-bits-min=128 -verify-machineinstrs | FileCheck %s --check-prefixes=NEON
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-n1 -O3 -opaque-pointers -verify-machineinstrs | FileCheck %s --check-prefixes=NEON
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v2 -O3 -opaque-pointers -verify-machineinstrs | FileCheck %s --check-prefixes=NEON
+
+define internal i32 @test(ptr nocapture readonly %p1, i32 %i1, ptr nocapture readonly %p2, i32 %i2) {
+; SVE256-LABEL: test:
+; SVE256: ld1b { z0.h }, p0/z,
+; SVE256: ld1b { z1.h }, p0/z,
+; SVE256: sub z0.h, z0.h, z1.h
+; SVE256-NEXT: sunpklo z1.s, z0.h
+; SVE256-NEXT: ext z0.b, z0.b, z0.b, #16
+; SVE256-NEXT: sunpklo z0.s, z0.h
+; SVE256-NEXT: add z0.s, z1.s, z0.s
+; SVE256-NEXT: uaddv d0, p1, z0.s
+
+; NEON-LABEL: test:
+; NEON: tbl
+; NEON-NEXT: tbl
+; NEON-NEXT: tbl
+; NEON-NEXT: tbl
+; NEON-NEXT: tbl
+; NEON-NEXT: tbl
+; NEON-NEXT: tbl
+; NEON-NEXT: tbl
+; NEON: addv
+
+
+L.entry:
+ br label %L1
+
+L1: ; preds = %L1, %L.entry
+ %a = phi i32 [ 16, %L.entry ], [ %14, %L1 ]
+ %b = phi i32 [ 0, %L.entry ], [ %13, %L1 ]
+ %i = phi i32 [ 0, %L.entry ], [ %12, %L1 ]
+ %0 = mul i32 %b, %i1
+ %1 = sext i32 %0 to i64
+ %2 = getelementptr i8, ptr %p1, i64 %1
+ %3 = mul i32 %b, %i2
+ %4 = sext i32 %3 to i64
+ %5 = getelementptr i8, ptr %p2, i64 %4
+ %6 = load <16 x i8>, ptr %2, align 1
+ %7 = zext <16 x i8> %6 to <16 x i32>
+ %8 = load <16 x i8>, ptr %5, align 1
+ %9 = zext <16 x i8> %8 to <16 x i32>
+ %10 = sub nsw <16 x i32> %7, %9
+ %11 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %10)
+ %12 = add i32 %11, %i
+ %13 = add nuw nsw i32 %b, 1
+ %14 = add nsw i32 %a, -1
+ %.not = icmp eq i32 %14, 0
+ br i1 %.not, label %L2, label %L1
+
+L2: ; preds = %L1
+ ret i32 %12
+}
+
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
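
To reproduce the SVE256 codegen checked above, the test's first RUN line can be
invoked directly:

    llc < llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -opaque-pointers -aarch64-sve-vector-bits-min=256 -verify-machineinstrs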