[llvm] [WebAssembly] Enable interleaved memory accesses (PR #125696)
Sam Parker via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 4 07:18:07 PST 2025
https://github.com/sparker-arm created https://github.com/llvm/llvm-project/pull/125696
Enable the vectorizer to access interleaved memory. This means that, when it's decided to be profitable, the memory accesses can be vectorized instead of the value being built up by a sequence of load_lane instructions. This will often increase the vectorization factor of the loop, leading to significantly better performance.
I ran a reasonably large collection of benchmarks and most are not affected by this change, with most performance changes <1%. However, I see a 2.5% speedup for the total run time of TSVC, a 1% speedup for SPEC2017 x265, a 28% speedup for a ResNet workload and a 95% speedup for libyuv. These results are from running V8 on an AArch64 box.
From a812e9207f7afebd23d0f92534e1315e0505cd09 Mon Sep 17 00:00:00 2001
From: Samuel Parker <sam.parker at arm.com>
Date: Tue, 4 Feb 2025 14:51:16 +0000
Subject: [PATCH] [WebAssembly] Enable interleaved memory accesses
Enable the vectorizer to access interleaved memory. This means that,
when it's decided to be profitable, the memory accesses can be
vectorized instead of the value being built up by a sequence of
load_lane instructions. This will often increase the vectorization
factor of the loop, leading to significantly better performance.
I ran a reasonably large collection of benchmarks and most are not
affected by this change, with most performance changes <1%. But I see
a 2.5% speedup for the total run time of TSVC, 1% speedup for SPEC2017
x265, 28% speedup for an ML workload and 95% for libyuv.
---
.../WebAssemblyTargetTransformInfo.h | 2 +
llvm/test/CodeGen/WebAssembly/interleave.ll | 361 ++++++++++++++++++
2 files changed, 363 insertions(+)
create mode 100644 llvm/test/CodeGen/WebAssembly/interleave.ll
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 4d498b154c5214..ba66306374c6cc 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -57,6 +57,8 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
/// \name Vector TTI Implementations
/// @{
+ bool enableInterleavedAccessVectorization() { return true; }
+
unsigned getNumberOfRegisters(unsigned ClassID) const;
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
InstructionCost getArithmeticInstrCost(
diff --git a/llvm/test/CodeGen/WebAssembly/interleave.ll b/llvm/test/CodeGen/WebAssembly/interleave.ll
new file mode 100644
index 00000000000000..c20b5e42c48509
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/interleave.ll
@@ -0,0 +1,361 @@
+; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
+
+target triple = "wasm32"
+target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-n32:64-S128-ni:1:10:20"
+
+%struct.Output32x2 = type { i32, i32 }
+%struct.Input8x2 = type { i8, i8 }
+%struct.Output32x4 = type { i32, i32, i32, i32 }
+%struct.Input8x4 = type { i8, i8, i8, i8 }
+%struct.Input16x2 = type { i16, i16 }
+%struct.Input16x4 = type { i16, i16, i16, i16 }
+%struct.Input32x2 = type { i32, i32 }
+%struct.Input32x4 = type { i32, i32, i32, i32 }
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
+define hidden void @accumulate8x2(ptr dead_on_unwind noalias writable sret(%struct.Output32x2) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
+; CHECK-LABEL: accumulate8x2:
+; CHECK: loop
+; CHECK: v128.load64_zero
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+ %4 = load i32, ptr %0, align 4
+ %5 = icmp eq i32 %2, 0
+ br i1 %5, label %10, label %6
+
+6: ; preds = %3
+ %7 = getelementptr inbounds nuw i8, ptr %0, i32 4
+ %8 = load i32, ptr %7, align 4
+ br label %12
+
+9: ; preds = %12
+ store i32 %23, ptr %7, align 4
+ br label %10
+
+10: ; preds = %9, %3
+ %11 = phi i32 [ %21, %9 ], [ %4, %3 ]
+ store i32 %11, ptr %0, align 4
+ ret void
+
+12: ; preds = %6, %12
+ %13 = phi i32 [ %8, %6 ], [ %23, %12 ]
+ %14 = phi i32 [ 0, %6 ], [ %24, %12 ]
+ %15 = phi i32 [ %4, %6 ], [ %21, %12 ]
+ %16 = getelementptr inbounds nuw %struct.Input8x2, ptr %1, i32 %14
+ %17 = load i8, ptr %16, align 1
+ %18 = getelementptr inbounds nuw i8, ptr %16, i32 1
+ %19 = load i8, ptr %18, align 1
+ %20 = zext i8 %17 to i32
+ %21 = add i32 %15, %20
+ %22 = zext i8 %19 to i32
+ %23 = add i32 %13, %22
+ %24 = add nuw i32 %14, 1
+ %25 = icmp eq i32 %24, %2
+ br i1 %25, label %9, label %12
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
+define hidden void @accumulate8x4(ptr dead_on_unwind noalias writable sret(%struct.Output32x4) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
+; CHECK-LABEL: accumulate8x4
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+ %4 = load i32, ptr %0, align 4
+ %5 = icmp eq i32 %2, 0
+ br i1 %5, label %14, label %6
+
+6: ; preds = %3
+ %7 = getelementptr inbounds nuw i8, ptr %0, i32 4
+ %8 = getelementptr inbounds nuw i8, ptr %0, i32 8
+ %9 = getelementptr inbounds nuw i8, ptr %0, i32 12
+ %10 = load i32, ptr %7, align 4
+ %11 = load i32, ptr %8, align 4
+ %12 = load i32, ptr %9, align 4
+ br label %16
+
+13: ; preds = %16
+ store i32 %33, ptr %7, align 4
+ store i32 %35, ptr %8, align 4
+ store i32 %37, ptr %9, align 4
+ br label %14
+
+14: ; preds = %13, %3
+ %15 = phi i32 [ %31, %13 ], [ %4, %3 ]
+ store i32 %15, ptr %0, align 4
+ ret void
+
+16: ; preds = %6, %16
+ %17 = phi i32 [ %12, %6 ], [ %37, %16 ]
+ %18 = phi i32 [ %11, %6 ], [ %35, %16 ]
+ %19 = phi i32 [ %10, %6 ], [ %33, %16 ]
+ %20 = phi i32 [ 0, %6 ], [ %38, %16 ]
+ %21 = phi i32 [ %4, %6 ], [ %31, %16 ]
+ %22 = getelementptr inbounds nuw %struct.Input8x4, ptr %1, i32 %20
+ %23 = load i8, ptr %22, align 1
+ %24 = getelementptr inbounds nuw i8, ptr %22, i32 1
+ %25 = load i8, ptr %24, align 1
+ %26 = getelementptr inbounds nuw i8, ptr %22, i32 2
+ %27 = load i8, ptr %26, align 1
+ %28 = getelementptr inbounds nuw i8, ptr %22, i32 3
+ %29 = load i8, ptr %28, align 1
+ %30 = zext i8 %23 to i32
+ %31 = add i32 %21, %30
+ %32 = zext i8 %25 to i32
+ %33 = add i32 %19, %32
+ %34 = zext i8 %27 to i32
+ %35 = add i32 %18, %34
+ %36 = zext i8 %29 to i32
+ %37 = add i32 %17, %36
+ %38 = add nuw i32 %20, 1
+ %39 = icmp eq i32 %38, %2
+ br i1 %39, label %13, label %16
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
+define hidden void @accumulate16x2(ptr dead_on_unwind noalias writable sret(%struct.Output32x2) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
+; CHECK-LABEL: accumulate16x2
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+ %4 = load i32, ptr %0, align 4
+ %5 = icmp eq i32 %2, 0
+ br i1 %5, label %10, label %6
+
+6: ; preds = %3
+ %7 = getelementptr inbounds nuw i8, ptr %0, i32 4
+ %8 = load i32, ptr %7, align 4
+ br label %12
+
+9: ; preds = %12
+ store i32 %23, ptr %7, align 4
+ br label %10
+
+10: ; preds = %9, %3
+ %11 = phi i32 [ %21, %9 ], [ %4, %3 ]
+ store i32 %11, ptr %0, align 4
+ ret void
+
+12: ; preds = %6, %12
+ %13 = phi i32 [ %8, %6 ], [ %23, %12 ]
+ %14 = phi i32 [ 0, %6 ], [ %24, %12 ]
+ %15 = phi i32 [ %4, %6 ], [ %21, %12 ]
+ %16 = getelementptr inbounds nuw %struct.Input16x2, ptr %1, i32 %14
+ %17 = load i16, ptr %16, align 2
+ %18 = getelementptr inbounds nuw i8, ptr %16, i32 2
+ %19 = load i16, ptr %18, align 2
+ %20 = zext i16 %17 to i32
+ %21 = add i32 %15, %20
+ %22 = zext i16 %19 to i32
+ %23 = add i32 %13, %22
+ %24 = add nuw i32 %14, 1
+ %25 = icmp eq i32 %24, %2
+ br i1 %25, label %9, label %12
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
+define hidden void @accumulate16x4(ptr dead_on_unwind noalias writable sret(%struct.Output32x4) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
+; CHECK-LABEL: accumulate16x4
+; CHECK: loop
+; CHECK: v128.load 0:p2align=1
+; CHECK: v128.load 16:p2align=1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+ %4 = load i32, ptr %0, align 4
+ %5 = icmp eq i32 %2, 0
+ br i1 %5, label %14, label %6
+
+6: ; preds = %3
+ %7 = getelementptr inbounds nuw i8, ptr %0, i32 4
+ %8 = getelementptr inbounds nuw i8, ptr %0, i32 8
+ %9 = getelementptr inbounds nuw i8, ptr %0, i32 12
+ %10 = load i32, ptr %7, align 4
+ %11 = load i32, ptr %8, align 4
+ %12 = load i32, ptr %9, align 4
+ br label %16
+
+13: ; preds = %16
+ store i32 %33, ptr %7, align 4
+ store i32 %35, ptr %8, align 4
+ store i32 %37, ptr %9, align 4
+ br label %14
+
+14: ; preds = %13, %3
+ %15 = phi i32 [ %31, %13 ], [ %4, %3 ]
+ store i32 %15, ptr %0, align 4
+ ret void
+
+16: ; preds = %6, %16
+ %17 = phi i32 [ %12, %6 ], [ %37, %16 ]
+ %18 = phi i32 [ %11, %6 ], [ %35, %16 ]
+ %19 = phi i32 [ %10, %6 ], [ %33, %16 ]
+ %20 = phi i32 [ 0, %6 ], [ %38, %16 ]
+ %21 = phi i32 [ %4, %6 ], [ %31, %16 ]
+ %22 = getelementptr inbounds nuw %struct.Input16x4, ptr %1, i32 %20
+ %23 = load i16, ptr %22, align 2
+ %24 = getelementptr inbounds nuw i8, ptr %22, i32 2
+ %25 = load i16, ptr %24, align 2
+ %26 = getelementptr inbounds nuw i8, ptr %22, i32 4
+ %27 = load i16, ptr %26, align 2
+ %28 = getelementptr inbounds nuw i8, ptr %22, i32 6
+ %29 = load i16, ptr %28, align 2
+ %30 = zext i16 %23 to i32
+ %31 = add i32 %21, %30
+ %32 = zext i16 %25 to i32
+ %33 = add i32 %19, %32
+ %34 = zext i16 %27 to i32
+ %35 = add i32 %18, %34
+ %36 = zext i16 %29 to i32
+ %37 = add i32 %17, %36
+ %38 = add nuw i32 %20, 1
+ %39 = icmp eq i32 %38, %2
+ br i1 %39, label %13, label %16
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
+define hidden void @accumulate32x2(ptr dead_on_unwind noalias writable sret(%struct.Output32x2) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
+; CHECK-LABEL: accumulate32x2
+; CHECK: loop
+; CHECK: v128.load 0:p2align=2
+; CHECK: v128.load 16:p2align=2
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i32x4.add
+ %4 = load i32, ptr %0, align 4
+ %5 = icmp eq i32 %2, 0
+ br i1 %5, label %10, label %6
+
+6: ; preds = %3
+ %7 = getelementptr inbounds nuw i8, ptr %0, i32 4
+ %8 = load i32, ptr %7, align 4
+ br label %12
+
+9: ; preds = %12
+ store i32 %21, ptr %7, align 4
+ br label %10
+
+10: ; preds = %9, %3
+ %11 = phi i32 [ %20, %9 ], [ %4, %3 ]
+ store i32 %11, ptr %0, align 4
+ ret void
+
+12: ; preds = %6, %12
+ %13 = phi i32 [ %8, %6 ], [ %21, %12 ]
+ %14 = phi i32 [ 0, %6 ], [ %22, %12 ]
+ %15 = phi i32 [ %4, %6 ], [ %20, %12 ]
+ %16 = getelementptr inbounds nuw %struct.Input32x2, ptr %1, i32 %14
+ %17 = load i32, ptr %16, align 4
+ %18 = getelementptr inbounds nuw i8, ptr %16, i32 4
+ %19 = load i32, ptr %18, align 4
+ %20 = add i32 %15, %17
+ %21 = add i32 %13, %19
+ %22 = add nuw i32 %14, 1
+ %23 = icmp eq i32 %22, %2
+ br i1 %23, label %9, label %12
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
+define hidden void @accumulate32x4(ptr dead_on_unwind noalias writable sret(%struct.Output32x4) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
+; CHECK-LABEL: accumulate32x4
+; CHECK: v128.load 0:p2align=2
+; CHECK: v128.load 16:p2align=2
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load 32:p2align=2
+; CHECK: v128.load 48:p2align=2
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.add
+ %4 = load i32, ptr %0, align 4
+ %5 = icmp eq i32 %2, 0
+ br i1 %5, label %14, label %6
+
+6: ; preds = %3
+ %7 = getelementptr inbounds nuw i8, ptr %0, i32 4
+ %8 = getelementptr inbounds nuw i8, ptr %0, i32 8
+ %9 = getelementptr inbounds nuw i8, ptr %0, i32 12
+ %10 = load i32, ptr %7, align 4
+ %11 = load i32, ptr %8, align 4
+ %12 = load i32, ptr %9, align 4
+ br label %16
+
+13: ; preds = %16
+ store i32 %31, ptr %7, align 4
+ store i32 %32, ptr %8, align 4
+ store i32 %33, ptr %9, align 4
+ br label %14
+
+14: ; preds = %13, %3
+ %15 = phi i32 [ %30, %13 ], [ %4, %3 ]
+ store i32 %15, ptr %0, align 4
+ ret void
+
+16: ; preds = %6, %16
+ %17 = phi i32 [ %12, %6 ], [ %33, %16 ]
+ %18 = phi i32 [ %11, %6 ], [ %32, %16 ]
+ %19 = phi i32 [ %10, %6 ], [ %31, %16 ]
+ %20 = phi i32 [ 0, %6 ], [ %34, %16 ]
+ %21 = phi i32 [ %4, %6 ], [ %30, %16 ]
+ %22 = getelementptr inbounds nuw %struct.Input32x4, ptr %1, i32 %20
+ %23 = load i32, ptr %22, align 4
+ %24 = getelementptr inbounds nuw i8, ptr %22, i32 4
+ %25 = load i32, ptr %24, align 4
+ %26 = getelementptr inbounds nuw i8, ptr %22, i32 8
+ %27 = load i32, ptr %26, align 4
+ %28 = getelementptr inbounds nuw i8, ptr %22, i32 12
+ %29 = load i32, ptr %28, align 4
+ %30 = add i32 %21, %23
+ %31 = add i32 %19, %25
+ %32 = add i32 %18, %27
+ %33 = add i32 %17, %29
+ %34 = add nuw i32 %20, 1
+ %35 = icmp eq i32 %34, %2
+ br i1 %35, label %13, label %16
+}
More information about the llvm-commits mailing list