[llvm] [NFC][WebAssembly] CodeGen test (PR #189626)

Sam Parker via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 31 04:39:00 PDT 2026


https://github.com/sparker-arm created https://github.com/llvm/llvm-project/pull/189626

None

>From 20975eb17177910a943685a5338dfa7c5bffaa6d Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker at arm.com>
Date: Tue, 31 Mar 2026 12:36:29 +0100
Subject: [PATCH] [NFC][WebAssembly] CodeGen test

---
 .../CodeGen/WebAssembly/strided-int-mac.ll    | 379 ++++++++++++++++++
 1 file changed, 379 insertions(+)
 create mode 100644 llvm/test/CodeGen/WebAssembly/strided-int-mac.ll

diff --git a/llvm/test/CodeGen/WebAssembly/strided-int-mac.ll b/llvm/test/CodeGen/WebAssembly/strided-int-mac.ll
new file mode 100644
index 0000000000000..a283c617a3cfa
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/strided-int-mac.ll
@@ -0,0 +1,379 @@
+; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
+; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH
+; RUN: opt -mattr=+simd128,+relaxed-simd -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128,+relaxed-simd -verify-machineinstrs -o - | FileCheck %s --check-prefix=RELAXED-MAX-BANDWIDTH
+
+target triple = "wasm32"
+
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: i32x4.extract_lane	3
+; CHECK: i32x4.extract_lane	2
+; CHECK: i32x4.extract_lane	1
+; CHECK: i32x4.extract_lane	0
+; CHECK: v128.load8_splat	0
+; CHECK: v128.load8_lane	0, 1
+; CHECK: v128.load8_lane	0, 2
+; CHECK: v128.load8_lane	0, 3
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: v128.load
+; CHECK: i8x16.shuffle	1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+; CHECK: v128.load8_splat	0
+; CHECK: v128.load8_lane	0, 1
+; CHECK: v128.load8_lane	0, 2
+; CHECK: v128.load8_lane	0, 3
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i8x16.shuffle	3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle	0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle	2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+; CHECK: v128.load8_splat	0
+; CHECK: v128.load8_lane	0, 1
+; CHECK: v128.load8_lane	0, 2
+; CHECK: v128.load8_lane	0, 3
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+; CHECK: v128.load8_splat	0
+; CHECK: v128.load8_lane	0, 1
+; CHECK: v128.load8_lane	0, 2
+; CHECK: v128.load8_lane	0, 3
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+; CHECK: i32x4.extmul_low_i16x8_s
+; CHECK: i32x4.add
+
+; MAX-BANDWIDTH: loop
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i32x4.extract_lane	3
+; MAX-BANDWIDTH: i32x4.extract_lane	2
+; MAX-BANDWIDTH: i32x4.extract_lane	1
+; MAX-BANDWIDTH: i32x4.extract_lane	0
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i32x4.extract_lane	3
+; MAX-BANDWIDTH: i32x4.extract_lane	2
+; MAX-BANDWIDTH: i32x4.extract_lane	1
+; MAX-BANDWIDTH: i32x4.extract_lane	0
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i32x4.extract_lane	3
+; MAX-BANDWIDTH: i32x4.extract_lane	2
+; MAX-BANDWIDTH: i32x4.extract_lane	1
+; MAX-BANDWIDTH: i32x4.extract_lane	0
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i32x4.extract_lane	3
+; MAX-BANDWIDTH: i32x4.extract_lane	2
+; MAX-BANDWIDTH: i32x4.extract_lane	1
+; MAX-BANDWIDTH: i32x4.extract_lane	0
+; MAX-BANDWIDTH: v128.load8_splat	0
+; MAX-BANDWIDTH: v128.load8_lane	0, 1
+; MAX-BANDWIDTH: v128.load8_lane	0, 2
+; MAX-BANDWIDTH: v128.load8_lane	0, 3
+; MAX-BANDWIDTH: v128.load8_lane	0, 4
+; MAX-BANDWIDTH: v128.load8_lane	0, 5
+; MAX-BANDWIDTH: v128.load8_lane	0, 6
+; MAX-BANDWIDTH: v128.load8_lane	0, 7
+; MAX-BANDWIDTH: v128.load8_lane	0, 8
+; MAX-BANDWIDTH: v128.load8_lane	0, 9
+; MAX-BANDWIDTH: v128.load8_lane	0, 10
+; MAX-BANDWIDTH: v128.load8_lane	0, 11
+; MAX-BANDWIDTH: v128.load8_lane	0, 12
+; MAX-BANDWIDTH: v128.load8_lane	0, 13
+; MAX-BANDWIDTH: v128.load8_lane	0, 14
+; MAX-BANDWIDTH: v128.load8_lane	0, 15
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i8x16.shuffle	3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i8x16.shuffle	0, 0, 0, 0, 0, 0, 0, 0, 3, 7, 11, 15, 19, 23, 27, 31
+; MAX-BANDWIDTH: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: v128.load8_splat	0
+; MAX-BANDWIDTH: v128.load8_lane	0, 1
+; MAX-BANDWIDTH: v128.load8_lane	0, 2
+; MAX-BANDWIDTH: v128.load8_lane	0, 3
+; MAX-BANDWIDTH: v128.load8_lane	0, 4
+; MAX-BANDWIDTH: v128.load8_lane	0, 5
+; MAX-BANDWIDTH: v128.load8_lane	0, 6
+; MAX-BANDWIDTH: v128.load8_lane	0, 7
+; MAX-BANDWIDTH: v128.load8_lane	0, 8
+; MAX-BANDWIDTH: v128.load8_lane	0, 9
+; MAX-BANDWIDTH: v128.load8_lane	0, 10
+; MAX-BANDWIDTH: v128.load8_lane	0, 11
+; MAX-BANDWIDTH: v128.load8_lane	0, 12
+; MAX-BANDWIDTH: v128.load8_lane	0, 13
+; MAX-BANDWIDTH: v128.load8_lane	0, 14
+; MAX-BANDWIDTH: v128.load8_lane	0, 15
+; MAX-BANDWIDTH: i8x16.shuffle	1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i8x16.shuffle	0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 9, 13, 17, 21, 25, 29
+; MAX-BANDWIDTH: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i8x16.shuffle	2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i8x16.shuffle	0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 10, 14, 18, 22, 26, 30
+; MAX-BANDWIDTH: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i8x16.shuffle	0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i8x16.shuffle	0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28
+; MAX-BANDWIDTH: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: v128.load8_splat	0
+; MAX-BANDWIDTH: v128.load8_lane	0, 1
+; MAX-BANDWIDTH: v128.load8_lane	0, 2
+; MAX-BANDWIDTH: v128.load8_lane	0, 3
+; MAX-BANDWIDTH: v128.load8_lane	0, 4
+; MAX-BANDWIDTH: v128.load8_lane	0, 5
+; MAX-BANDWIDTH: v128.load8_lane	0, 6
+; MAX-BANDWIDTH: v128.load8_lane	0, 7
+; MAX-BANDWIDTH: v128.load8_lane	0, 8
+; MAX-BANDWIDTH: v128.load8_lane	0, 9
+; MAX-BANDWIDTH: v128.load8_lane	0, 10
+; MAX-BANDWIDTH: v128.load8_lane	0, 11
+; MAX-BANDWIDTH: v128.load8_lane	0, 12
+; MAX-BANDWIDTH: v128.load8_lane	0, 13
+; MAX-BANDWIDTH: v128.load8_lane	0, 14
+; MAX-BANDWIDTH: v128.load8_lane	0, 15
+; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: v128.load8_splat	0
+; MAX-BANDWIDTH: v128.load8_lane	0, 1
+; MAX-BANDWIDTH: v128.load8_lane	0, 2
+; MAX-BANDWIDTH: v128.load8_lane	0, 3
+; MAX-BANDWIDTH: v128.load8_lane	0, 4
+; MAX-BANDWIDTH: v128.load8_lane	0, 5
+; MAX-BANDWIDTH: v128.load8_lane	0, 6
+; MAX-BANDWIDTH: v128.load8_lane	0, 7
+; MAX-BANDWIDTH: v128.load8_lane	0, 8
+; MAX-BANDWIDTH: v128.load8_lane	0, 9
+; MAX-BANDWIDTH: v128.load8_lane	0, 10
+; MAX-BANDWIDTH: v128.load8_lane	0, 11
+; MAX-BANDWIDTH: v128.load8_lane	0, 12
+; MAX-BANDWIDTH: v128.load8_lane	0, 13
+; MAX-BANDWIDTH: v128.load8_lane	0, 14
+; MAX-BANDWIDTH: v128.load8_lane	0, 15
+; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+
+; RELAXED-MAX-BANDWIDTH: loop
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane	3
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane	2
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane	1
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane	0
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane	3
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane	2
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane	1
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane	0
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane	3
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane	2
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane	1
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane	0
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane	3
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane	2
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane	1
+; RELAXED-MAX-BANDWIDTH: i32x4.extract_lane	0
+; RELAXED-MAX-BANDWIDTH: v128.load8_splat	0
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 1
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 2
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 3
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 4
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 5
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 6
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 7
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 8
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 9
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 10
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 11
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 12
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 13
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 14
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 15
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	0, 0, 0, 0, 0, 0, 0, 0, 3, 7, 11, 15, 19, 23, 27, 31
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; RELAXED-MAX-BANDWIDTH: v128.load8_splat	0
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 1
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 2
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 3
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 4
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 5
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 6
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 7
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 8
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 9
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 10
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 11
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 12
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 13
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 14
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 15
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 9, 13, 17, 21, 25, 29
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 10, 14, 18, 22, 26, 30
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 8, 12, 16, 20, 24, 28
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+; RELAXED-MAX-BANDWIDTH: v128.load8_splat	0
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 1
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 2
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 3
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 4
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 5
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 6
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 7
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 8
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 9
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 10
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 11
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 12
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 13
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 14
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 15
+; RELAXED-MAX-BANDWIDTH: v128.load8_splat	0
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 1
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 2
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 3
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 4
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 5
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 6
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 7
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 8
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 9
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 10
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 11
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 12
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 13
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 14
+; RELAXED-MAX-BANDWIDTH: v128.load8_lane	0, 15
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+define hidden { i32, i32, i32, i32 } @bb2053_inner_loop(ptr nocapture %base0, ptr nocapture %base1, ptr nocapture %weights, ptr nocapture readonly %indices, i32 %len, i32 %stride, i32 %acc0, i32 %acc1, i32 %acc2, i32 %acc3) local_unnamed_addr { ; four i8*i8->i32 multiply-accumulate chains; returns { accA, accB, accC, accD }
+entry:
+  br label %bb2053.loop
+
+bb2053.loop: ; bottom-tested loop: the body executes before %idx.next is compared with %len
+  %idx = phi i32 [ 0, %entry ], [ %idx.next, %bb2053.loop ]
+  %accA = phi i32 [ %acc0, %entry ], [ %accA.sum, %bb2053.loop ] ; accA += w0*lhs0 + w2*lhs1
+  %accB = phi i32 [ %acc1, %entry ], [ %accB.sum, %bb2053.loop ] ; accB += w0*rhs0 + w2*rhs1
+  %accC = phi i32 [ %acc2, %entry ], [ %accC.sum, %bb2053.loop ] ; accC += w1*lhs0 + w3*lhs1
+  %accD = phi i32 [ %acc3, %entry ], [ %accD.sum, %bb2053.loop ] ; accD += w1*rhs0 + w3*rhs1
+  %wptr = phi ptr [ %weights, %entry ], [ %wptr.next, %bb2053.loop ] ; advances by 4 bytes per iteration
+  %idx.ptr = getelementptr inbounds nuw i32, ptr %indices, i32 %idx
+  %idx.val = load i32, ptr %idx.ptr, align 4 ; indices[idx]: byte offset applied to both %base0 and %base1
+  %lhs0.ptr = getelementptr inbounds i8, ptr %base0, i32 %idx.val
+  %rhs0.ptr = getelementptr inbounds i8, ptr %base1, i32 %idx.val
+  %lhs0 = load i8, ptr %lhs0.ptr, align 1
+  %lhs0.sext = sext i8 %lhs0 to i32
+  %w0 = load i8, ptr %wptr, align 1 ; w0..w3 are the four consecutive bytes at %wptr
+  %w0.sext = sext i8 %w0 to i32
+  %mul0 = mul nsw i32 %w0.sext, %lhs0.sext
+  %accA.next = add nsw i32 %mul0, %accA
+  %w1.ptr = getelementptr inbounds nuw i8, ptr %wptr, i32 1
+  %w1 = load i8, ptr %w1.ptr, align 1
+  %w1.sext = sext i8 %w1 to i32
+  %mul1 = mul nsw i32 %w1.sext, %lhs0.sext
+  %accC.next = add nsw i32 %mul1, %accC
+  %lhs1.ptr = getelementptr inbounds nuw i8, ptr %lhs0.ptr, i32 %stride ; second lhs element sits %stride bytes after the first
+  %lhs1 = load i8, ptr %lhs1.ptr, align 1
+  %lhs1.sext = sext i8 %lhs1 to i32
+  %w2.ptr = getelementptr inbounds nuw i8, ptr %wptr, i32 2
+  %w2 = load i8, ptr %w2.ptr, align 1
+  %w2.sext = sext i8 %w2 to i32
+  %mul2 = mul nsw i32 %w2.sext, %lhs1.sext
+  %accA.sum = add nsw i32 %accA.next, %mul2
+  %w3.ptr = getelementptr inbounds nuw i8, ptr %wptr, i32 3
+  %w3 = load i8, ptr %w3.ptr, align 1
+  %w3.sext = sext i8 %w3 to i32
+  %mul3 = mul nsw i32 %w3.sext, %lhs1.sext
+  %accC.sum = add nsw i32 %accC.next, %mul3
+  %rhs0 = load i8, ptr %rhs0.ptr, align 1
+  %rhs0.sext = sext i8 %rhs0 to i32
+  %mul4 = mul nsw i32 %rhs0.sext, %w0.sext ; rhs chains reuse the already-extended weights
+  %accB.next = add nsw i32 %mul4, %accB
+  %mul5 = mul nsw i32 %rhs0.sext, %w1.sext
+  %accD.next = add nsw i32 %mul5, %accD
+  %rhs1.ptr = getelementptr inbounds nuw i8, ptr %rhs0.ptr, i32 %stride ; second rhs element sits %stride bytes after the first
+  %rhs1 = load i8, ptr %rhs1.ptr, align 1
+  %rhs1.sext = sext i8 %rhs1 to i32
+  %mul6 = mul nsw i32 %rhs1.sext, %w2.sext
+  %accB.sum = add nsw i32 %accB.next, %mul6
+  %mul7 = mul nsw i32 %rhs1.sext, %w3.sext
+  %accD.sum = add nsw i32 %accD.next, %mul7
+  %wptr.next = getelementptr inbounds nuw i8, ptr %wptr, i32 4
+  %idx.next = add nuw nsw i32 %idx, 1
+  %exit = icmp eq i32 %idx.next, %len
+  br i1 %exit, label %bb2053.exit, label %bb2053.loop
+
+bb2053.exit:
+  %res0 = insertvalue { i32, i32, i32, i32 } poison, i32 %accA.sum, 0 ; poison rather than undef: every field is overwritten below
+  %res1 = insertvalue { i32, i32, i32, i32 } %res0, i32 %accB.sum, 1
+  %res2 = insertvalue { i32, i32, i32, i32 } %res1, i32 %accC.sum, 2
+  %res3 = insertvalue { i32, i32, i32, i32 } %res2, i32 %accD.sum, 3
+  ret { i32, i32, i32, i32 } %res3
+}



More information about the llvm-commits mailing list