[llvm] [NFC][WebAssembly] CodeGen test (PR #189626)
Sam Parker via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 31 04:39:00 PDT 2026
https://github.com/sparker-arm created https://github.com/llvm/llvm-project/pull/189626
None
>From 20975eb17177910a943685a5338dfa7c5bffaa6d Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker at arm.com>
Date: Tue, 31 Mar 2026 12:36:29 +0100
Subject: [PATCH] [NFC][WebAssembly] CodeGen test
---
.../CodeGen/WebAssembly/strided-int-mac.ll | 379 ++++++++++++++++++
1 file changed, 379 insertions(+)
create mode 100644 llvm/test/CodeGen/WebAssembly/strided-int-mac.ll
diff --git a/llvm/test/CodeGen/WebAssembly/strided-int-mac.ll b/llvm/test/CodeGen/WebAssembly/strided-int-mac.ll
new file mode 100644
index 0000000000000..a283c617a3cfa
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/strided-int-mac.ll
@@ -0,0 +1,379 @@
+; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
+; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH
+; RUN: opt -mattr=+simd128,+relaxed-simd -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128,+relaxed-simd -verify-machineinstrs -o - | FileCheck %s --check-prefix=RELAXED-MAX-BANDWIDTH
+
+target triple = "wasm32"
+
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: i32x4.extract_lane 3
+; CHECK: i32x4.extract_lane 2
+; CHECK: i32x4.extract_lane 1
+; CHECK: i32x4.extract_lane 0
+; CHECK: v128.load8_splat 0
+; CHECK: v128.load8_lane 0, 1
+; CHECK: v128.load8_lane 0, 2
+; CHECK: v128.load8_lane 0, 3
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: v128.load
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+; CHECK: v128.load8_splat 0
+; CHECK: v128.load8_lane 0, 1
+; CHECK: v128.load8_lane 0, 2
+; CHECK: v128.load8_lane 0, 3
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+; CHECK: v128.load8_splat 0
+; CHECK: v128.load8_lane 0, 1
+; CHECK: v128.load8_lane 0, 2
+; CHECK: v128.load8_lane 0, 3
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+; CHECK: v128.load8_splat 0
+; CHECK: v128.load8_lane 0, 1
+; CHECK: v128.load8_lane 0, 2
+; CHECK: v128.load8_lane 0, 3
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+
+; MAX-BANDWIDTH: loop
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i32x4.extract_lane 3
+; MAX-BANDWIDTH: i32x4.extract_lane 2
+; MAX-BANDWIDTH: i32x4.extract_lane 1
+; MAX-BANDWIDTH: i32x4.extract_lane 0
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i32x4.extract_lane 3
+; MAX-BANDWIDTH: i32x4.extract_lane 2
+; MAX-BANDWIDTH: i32x4.extract_lane 1
+; MAX-BANDWIDTH: i32x4.extract_lane 0
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i32x4.extract_lane 3
+; MAX-BANDWIDTH: i32x4.extract_lane 2
+; MAX-BANDWIDTH: i32x4.extract_lane 1
+; MAX-BANDWIDTH: i32x4.extract_lane 0
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i32x4.extract_lane 3
+; MAX-BANDWIDTH: i32x4.extract_lane 2
+; MAX-BANDWIDTH: i32x4.extract_lane 1
+; MAX-BANDWIDTH: i32x4.extract_lane 0
+; MAX-BANDWIDTH: v128.load8_splat 0
+; MAX-BANDWIDTH: v128.load8_lane 0, 1
+; MAX-BANDWIDTH: v128.load8_lane 0, 2
+; MAX-BANDWIDTH: v128.load8_lane 0, 3
+; MAX-BANDWIDTH: v128.load8_lane 0, 4
+; MAX-BANDWIDTH: v128.load8_lane 0, 5
+; MAX-BANDWIDTH: v128.load8_lane 0, 6
+; MAX-BANDWIDTH: v128.load8_lane 0, 7
+; MAX-BANDWIDTH: v128.load8_lane 0, 8
+; MAX-BANDWIDTH: v128.load8_lane 0, 9
+; MAX-BANDWIDTH: v128.load8_lane 0, 10
+; MAX-BANDWIDTH: v128.load8_lane 0, 11
+; MAX-BANDWIDTH: v128.load8_lane 0, 12
+; MAX-BANDWIDTH: v128.load8_lane 0, 13
+; MAX-BANDWIDTH: v128.load8_lane 0, 14
+; MAX-BANDWIDTH: v128.load8_lane 0, 15
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 3, 7, 11, 15, 19, 23, 27, 31
+; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: v128.load8_splat 0
+; MAX-BANDWIDTH: v128.load8_lane 0, 1
+; MAX-BANDWIDTH: v128.load8_lane 0, 2
+; MAX-BANDWIDTH: v128.load8_lane 0, 3
+; MAX-BANDWIDTH: v128.load8_lane 0, 4
+; MAX-BANDWIDTH: v128.load8_lane 0, 5
+; MAX-BANDWIDTH: v128.load8_lane 0, 6
+; MAX-BANDWIDTH: v128.load8_lane 0, 7
+; MAX-BANDWIDTH: v128.load8_lane 0, 8
+; MAX-BANDWIDTH: v128.load8_lane 0, 9
+; MAX-BANDWIDTH: v128.load8_lane 0, 10
+; MAX-BANDWIDTH: v128.load8_lane 0, 11
+; MAX-BANDWIDTH: v128.load8_lane 0, 12
+; MAX-BANDWIDTH: v128.load8_lane 0, 13
+; MAX-BANDWIDTH: v128.load8_lane 0, 14
+; MAX-BANDWIDTH: v128.load8_lane 0, 15
+; MAX-BANDWIDTH: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 9, 13, 17, 21, 25, 29
+; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 10, 14, 18, 22, 26, 30
+; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28
+; MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: v128.load8_splat 0
+; MAX-BANDWIDTH: v128.load8_lane 0, 1
+; MAX-BANDWIDTH: v128.load8_lane 0, 2
+; MAX-BANDWIDTH: v128.load8_lane 0, 3
+; MAX-BANDWIDTH: v128.load8_lane 0, 4
+; MAX-BANDWIDTH: v128.load8_lane 0, 5
+; MAX-BANDWIDTH: v128.load8_lane 0, 6
+; MAX-BANDWIDTH: v128.load8_lane 0, 7
+; MAX-BANDWIDTH: v128.load8_lane 0, 8
+; MAX-BANDWIDTH: v128.load8_lane 0, 9
+; MAX-BANDWIDTH: v128.load8_lane 0, 10
+; MAX-BANDWIDTH: v128.load8_lane 0, 11
+; MAX-BANDWIDTH: v128.load8_lane 0, 12
+; MAX-BANDWIDTH: v128.load8_lane 0, 13
+; MAX-BANDWIDTH: v128.load8_lane 0, 14
+; MAX-BANDWIDTH: v128.load8_lane 0, 15
+; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: v128.load8_splat 0
+; MAX-BANDWIDTH: v128.load8_lane 0, 1
+; MAX-BANDWIDTH: v128.load8_lane 0, 2
+; MAX-BANDWIDTH: v128.load8_lane 0, 3
+; MAX-BANDWIDTH: v128.load8_lane 0, 4
+; MAX-BANDWIDTH: v128.load8_lane 0, 5
+; MAX-BANDWIDTH: v128.load8_lane 0, 6
+; MAX-BANDWIDTH: v128.load8_lane 0, 7
+; MAX-BANDWIDTH: v128.load8_lane 0, 8
+; MAX-BANDWIDTH: v128.load8_lane 0, 9
+; MAX-BANDWIDTH: v128.load8_lane 0, 10
+; MAX-BANDWIDTH: v128.load8_lane 0, 11
+; MAX-BANDWIDTH: v128.load8_lane 0, 12
+; MAX-BANDWIDTH: v128.load8_lane 0, 13
+; MAX-BANDWIDTH: v128.load8_lane 0, 14
+; MAX-BANDWIDTH: v128.load8_lane 0, 15
+; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+
+; RELAXED-MAX-BANDWIDTH: loop
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 3
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 2
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 0
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 3
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 2
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 0
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 3
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 2
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 0
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 3
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 2
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane 0
+; RELAXED-MAX-BANDWIDTH: v128.load8_splat 0
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 1
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 2
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 3
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 4
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 5
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 6
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 7
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 8
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 9
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 10
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 11
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 12
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 13
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 14
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 15
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 3, 7, 11, 15, 19, 23, 27, 31
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; RELAXED-MAX-BANDWIDTH: v128.load8_splat 0
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 1
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 2
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 3
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 4
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 5
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 6
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 7
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 8
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 9
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 10
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 11
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 12
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 13
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 14
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 15
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 9, 13, 17, 21, 25, 29
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 10, 14, 18, 22, 26, 30
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+; RELAXED-MAX-BANDWIDTH: v128.load8_splat 0
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 1
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 2
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 3
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 4
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 5
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 6
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 7
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 8
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 9
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 10
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 11
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 12
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 13
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 14
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 15
+; RELAXED-MAX-BANDWIDTH: v128.load8_splat 0
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 1
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 2
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 3
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 4
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 5
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 6
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 7
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 8
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 9
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 10
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 11
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 12
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 13
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 14
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane 0, 15
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+define hidden { i32, i32, i32, i32 } @bb2053_inner_loop(ptr nocapture %base0, ptr nocapture %base1, ptr nocapture %weights, ptr nocapture readonly %indices, i32 %len, i32 %stride, i32 %acc0, i32 %acc1, i32 %acc2, i32 %acc3) local_unnamed_addr {
+entry: ; Indexed, strided i8 multiply-accumulate into four i32 accumulators seeded from %acc0..%acc3.
+ br label %bb2053.loop
+
+bb2053.loop: ; Bottom-tested loop: the body executes before %idx.next is compared with %len.
+ %idx = phi i32 [ 0, %entry ], [ %idx.next, %bb2053.loop ] ; iteration counter, starts at 0
+ %accA = phi i32 [ %acc0, %entry ], [ %accA.sum, %bb2053.loop ] ; accumulates %base0 data times weights 0 and 2
+ %accB = phi i32 [ %acc1, %entry ], [ %accB.sum, %bb2053.loop ] ; accumulates %base1 data times weights 0 and 2
+ %accC = phi i32 [ %acc2, %entry ], [ %accC.sum, %bb2053.loop ] ; accumulates %base0 data times weights 1 and 3
+ %accD = phi i32 [ %acc3, %entry ], [ %accD.sum, %bb2053.loop ] ; accumulates %base1 data times weights 1 and 3
+ %wptr = phi ptr [ %weights, %entry ], [ %wptr.next, %bb2053.loop ] ; weight cursor, advanced by 4 bytes per iteration
+ %idx.ptr = getelementptr inbounds nuw i32, ptr %indices, i32 %idx
+ %idx.val = load i32, ptr %idx.ptr, align 4 ; byte offset read from %indices[%idx]
+ %lhs0.ptr = getelementptr inbounds i8, ptr %base0, i32 %idx.val
+ %rhs0.ptr = getelementptr inbounds i8, ptr %base1, i32 %idx.val ; both bases use the same loaded offset
+ %lhs0 = load i8, ptr %lhs0.ptr, align 1
+ %lhs0.sext = sext i8 %lhs0 to i32
+ %w0 = load i8, ptr %wptr, align 1
+ %w0.sext = sext i8 %w0 to i32
+ %mul0 = mul nsw i32 %w0.sext, %lhs0.sext
+ %accA.next = add nsw i32 %mul0, %accA ; accA + w0 * lhs0
+ %w1.ptr = getelementptr inbounds nuw i8, ptr %wptr, i32 1
+ %w1 = load i8, ptr %w1.ptr, align 1
+ %w1.sext = sext i8 %w1 to i32
+ %mul1 = mul nsw i32 %w1.sext, %lhs0.sext
+ %accC.next = add nsw i32 %mul1, %accC ; accC + w1 * lhs0
+ %lhs1.ptr = getelementptr inbounds nuw i8, ptr %lhs0.ptr, i32 %stride ; second %base0 element, %stride bytes past the first
+ %lhs1 = load i8, ptr %lhs1.ptr, align 1
+ %lhs1.sext = sext i8 %lhs1 to i32
+ %w2.ptr = getelementptr inbounds nuw i8, ptr %wptr, i32 2
+ %w2 = load i8, ptr %w2.ptr, align 1
+ %w2.sext = sext i8 %w2 to i32
+ %mul2 = mul nsw i32 %w2.sext, %lhs1.sext
+ %accA.sum = add nsw i32 %accA.next, %mul2 ; accA + w0 * lhs0 + w2 * lhs1
+ %w3.ptr = getelementptr inbounds nuw i8, ptr %wptr, i32 3
+ %w3 = load i8, ptr %w3.ptr, align 1
+ %w3.sext = sext i8 %w3 to i32
+ %mul3 = mul nsw i32 %w3.sext, %lhs1.sext
+ %accC.sum = add nsw i32 %accC.next, %mul3 ; accC + w1 * lhs0 + w3 * lhs1
+ %rhs0 = load i8, ptr %rhs0.ptr, align 1
+ %rhs0.sext = sext i8 %rhs0 to i32
+ %mul4 = mul nsw i32 %rhs0.sext, %w0.sext ; the %base1 products reuse the sign-extended weights loaded above
+ %accB.next = add nsw i32 %mul4, %accB ; accB + w0 * rhs0
+ %mul5 = mul nsw i32 %rhs0.sext, %w1.sext
+ %accD.next = add nsw i32 %mul5, %accD ; accD + w1 * rhs0
+ %rhs1.ptr = getelementptr inbounds nuw i8, ptr %rhs0.ptr, i32 %stride ; second %base1 element, %stride bytes past the first
+ %rhs1 = load i8, ptr %rhs1.ptr, align 1
+ %rhs1.sext = sext i8 %rhs1 to i32
+ %mul6 = mul nsw i32 %rhs1.sext, %w2.sext
+ %accB.sum = add nsw i32 %accB.next, %mul6 ; accB + w0 * rhs0 + w2 * rhs1
+ %mul7 = mul nsw i32 %rhs1.sext, %w3.sext
+ %accD.sum = add nsw i32 %accD.next, %mul7 ; accD + w1 * rhs0 + w3 * rhs1
+ %wptr.next = getelementptr inbounds nuw i8, ptr %wptr, i32 4 ; four weight bytes are consumed per iteration
+ %idx.next = add nuw nsw i32 %idx, 1
+ %exit = icmp eq i32 %idx.next, %len
+ br i1 %exit, label %bb2053.exit, label %bb2053.loop
+
+bb2053.exit: ; Pack the final sums into the returned struct in the order A, B, C, D.
+ %res0 = insertvalue { i32, i32, i32, i32 } poison, i32 %accA.sum, 0 ; poison seed: all four fields are overwritten below
+ %res1 = insertvalue { i32, i32, i32, i32 } %res0, i32 %accB.sum, 1
+ %res2 = insertvalue { i32, i32, i32, i32 } %res1, i32 %accC.sum, 2
+ %res3 = insertvalue { i32, i32, i32, i32 } %res2, i32 %accD.sum, 3
+ ret { i32, i32, i32, i32 } %res3
+}
More information about the llvm-commits
mailing list