[llvm] [WebAssembly] load_zero to initialise build_vector (PR #100610)
Sam Parker via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 1 07:53:26 PDT 2024
https://github.com/sparker-arm updated https://github.com/llvm/llvm-project/pull/100610
From 21d900d15deb5340ce8615872966915349559ef7 Mon Sep 17 00:00:00 2001
From: Samuel Parker <sam.parker at arm.com>
Date: Thu, 25 Jul 2024 16:48:53 +0100
Subject: [PATCH] [WebAssembly] load_zero to initialise build_vector
Instead of splatting a single lane to initialise a build_vector, lower
to scalar_to_vector, which can be selected to load_zero. Also add
load_zero and load_lane patterns for f32x4 and f64x2.
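For example, for a build_vector such as (a sketch of the kind of IR this
affects):

  define <2 x i64> @pair(ptr %a, ptr %b) {
    %x = load i64, ptr %a
    %y = load i64, ptr %b
    %v = insertelement <2 x i64> undef, i64 %x, i32 0
    %v.1 = insertelement <2 x i64> %v, i64 %y, i32 1
    ret <2 x i64> %v.1
  }

the first lane was previously initialised with a splat, which could be
selected as a load_splat; it can now be selected as v128.load64_zero,
with the remaining lane still filled by v128.load64_lane.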
---
.../MCTargetDesc/WebAssemblyMCTargetDesc.h | 12 ++--
.../WebAssembly/WebAssemblyISelLowering.cpp | 11 ++-
.../WebAssembly/WebAssemblyInstrSIMD.td | 49 +++++++------
.../CodeGen/WebAssembly/simd-build-vector.ll | 69 +++++++++++++++++++
4 files changed, 111 insertions(+), 30 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 7f1a5f616ed48..eb3087dafed2a 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -177,7 +177,7 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
WASM_LOAD_STORE(ATOMIC_RMW8_U_CMPXCHG_I32)
WASM_LOAD_STORE(ATOMIC_RMW8_U_CMPXCHG_I64)
WASM_LOAD_STORE(LOAD8_SPLAT)
- WASM_LOAD_STORE(LOAD_LANE_I8x16)
+ WASM_LOAD_STORE(LOAD_LANE_8)
WASM_LOAD_STORE(STORE_LANE_I8x16)
return 0;
WASM_LOAD_STORE(LOAD16_S_I32)
@@ -205,7 +205,7 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
WASM_LOAD_STORE(ATOMIC_RMW16_U_CMPXCHG_I32)
WASM_LOAD_STORE(ATOMIC_RMW16_U_CMPXCHG_I64)
WASM_LOAD_STORE(LOAD16_SPLAT)
- WASM_LOAD_STORE(LOAD_LANE_I16x8)
+ WASM_LOAD_STORE(LOAD_LANE_16)
WASM_LOAD_STORE(STORE_LANE_I16x8)
WASM_LOAD_STORE(LOAD_F16_F32)
WASM_LOAD_STORE(STORE_F16_F32)
@@ -238,8 +238,8 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
WASM_LOAD_STORE(MEMORY_ATOMIC_NOTIFY)
WASM_LOAD_STORE(MEMORY_ATOMIC_WAIT32)
WASM_LOAD_STORE(LOAD32_SPLAT)
- WASM_LOAD_STORE(LOAD_ZERO_I32x4)
- WASM_LOAD_STORE(LOAD_LANE_I32x4)
+ WASM_LOAD_STORE(LOAD_ZERO_32)
+ WASM_LOAD_STORE(LOAD_LANE_32)
WASM_LOAD_STORE(STORE_LANE_I32x4)
return 2;
WASM_LOAD_STORE(LOAD_I64)
@@ -263,8 +263,8 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
WASM_LOAD_STORE(LOAD_EXTEND_U_I32x4)
WASM_LOAD_STORE(LOAD_EXTEND_S_I64x2)
WASM_LOAD_STORE(LOAD_EXTEND_U_I64x2)
- WASM_LOAD_STORE(LOAD_ZERO_I64x2)
- WASM_LOAD_STORE(LOAD_LANE_I64x2)
+ WASM_LOAD_STORE(LOAD_ZERO_64)
+ WASM_LOAD_STORE(LOAD_LANE_64)
WASM_LOAD_STORE(STORE_LANE_I64x2)
return 3;
WASM_LOAD_STORE(LOAD_V128)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index f77076d7244ca..960ef90148095 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -2275,8 +2275,15 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
return IsConstant(Lane);
};
} else {
- // Use a splat (which might be selected as a load splat)
- Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
+ size_t DestLaneSize = VecT.getVectorElementType().getFixedSizeInBits();
+ if (NumSplatLanes == 1 && (DestLaneSize == 32 || DestLaneSize == 64)) {
+ // Could be selected to load_zero.
+ assert(SplatValue == Op->getOperand(0));
+ Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecT, SplatValue);
+ } else {
+ // Use a splat (which might be selected as a load splat)
+ Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
+ }
IsLaneConstructed = [&SplatValue](size_t _, const SDValue &Lane) {
return Lane == SplatValue;
};
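SCALAR_TO_VECTOR defines only lane 0 and leaves the remaining lanes
undefined, so the zero-filling semantics of load32_zero/load64_zero are a
legal selection for it. A minimal sketch of the single-splat-lane case
this new path targets:

  define <4 x i32> @lane0(ptr %p) {
    %x = load i32, ptr %p
    ; Only lane 0 is defined; the rest are undef, so zeroing them is fine.
    %v = insertelement <4 x i32> undef, i32 %x, i32 0
    ret <4 x i32> %v   ; expected to select to a single v128.load32_zero
  }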
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 26fe61b1d6051..a1697299ee424 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -273,13 +273,13 @@ defm : LoadPat<vec.vt, loadpat, inst>;
multiclass SIMDLoadZero<Vec vec, bits<32> simdop> {
defvar name = "v128.load"#vec.lane_bits#"_zero";
let mayLoad = 1, UseNamedOperandTable = 1 in {
- defm LOAD_ZERO_#vec#_A32 :
+ defm LOAD_ZERO_#vec.lane_bits#_A32 :
SIMD_I<(outs V128:$dst),
(ins P2Align:$p2align, offset32_op:$off, I32:$addr),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
name#"\t$dst, ${off}(${addr})$p2align",
name#"\t$off$p2align", simdop>;
- defm LOAD_ZERO_#vec#_A64 :
+ defm LOAD_ZERO_#vec.lane_bits#_A64 :
SIMD_I<(outs V128:$dst),
(ins P2Align:$p2align, offset64_op:$off, I64:$addr),
(outs), (ins P2Align:$p2align, offset64_op:$off), [],
@@ -293,32 +293,32 @@ defm "" : SIMDLoadZero<I64x2, 0x5d>;
// Use load_zero to load scalars into vectors as well where possible.
// TODO: i16, and i8 scalars
-foreach vec = [I32x4, I64x2] in {
- defvar inst = "LOAD_ZERO_"#vec;
+foreach vec = [I32x4, I64x2, F32x4, F64x2] in {
+ defvar inst = "LOAD_ZERO_"#vec.lane_bits;
defvar pat = PatFrag<(ops node:$addr), (scalar_to_vector (vec.lane_vt (load $addr)))>;
defm : LoadPat<vec.vt, pat, inst>;
}
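With F32x4 and F64x2 now in the foreach above, the same selection applies
to scalar floating-point loads too, e.g. (a sketch):

  define <2 x double> @scalar_f64(ptr %p) {
    %x = load double, ptr %p
    %v = insertelement <2 x double> undef, double %x, i32 0
    ret <2 x double> %v   ; now selectable as v128.load64_zero
  }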
// TODO: f32x4 and f64x2 as well
foreach vec = [I32x4, I64x2] in {
- defvar inst = "LOAD_ZERO_"#vec;
+ defvar inst = "LOAD_ZERO_"#vec.lane_bits;
defvar pat = PatFrag<(ops node:$ptr),
(vector_insert (vec.splat (vec.lane_vt 0)), (vec.lane_vt (load $ptr)), 0)>;
defm : LoadPat<vec.vt, pat, inst>;
}
// Load lane
-multiclass SIMDLoadLane<Vec vec, bits<32> simdop> {
- defvar name = "v128.load"#vec.lane_bits#"_lane";
+multiclass SIMDLoadLane<bits<32> lane_bits, bits<32> simdop> {
+ defvar name = "v128.load"#lane_bits#"_lane";
let mayLoad = 1, UseNamedOperandTable = 1 in {
- defm LOAD_LANE_#vec#_A32 :
+ defm LOAD_LANE_#lane_bits#_A32 :
SIMD_I<(outs V128:$dst),
(ins P2Align:$p2align, offset32_op:$off, vec_i8imm_op:$idx,
I32:$addr, V128:$vec),
(outs), (ins P2Align:$p2align, offset32_op:$off, vec_i8imm_op:$idx),
[], name#"\t$dst, ${off}(${addr})$p2align, $vec, $idx",
name#"\t$off$p2align, $idx", simdop>;
- defm LOAD_LANE_#vec#_A64 :
+ defm LOAD_LANE_#lane_bits#_A64 :
SIMD_I<(outs V128:$dst),
(ins P2Align:$p2align, offset64_op:$off, vec_i8imm_op:$idx,
I64:$addr, V128:$vec),
@@ -328,15 +328,15 @@ multiclass SIMDLoadLane<Vec vec, bits<32> simdop> {
} // mayLoad = 1, UseNamedOperandTable = 1
}
-defm "" : SIMDLoadLane<I8x16, 0x54>;
-defm "" : SIMDLoadLane<I16x8, 0x55>;
-defm "" : SIMDLoadLane<I32x4, 0x56>;
-defm "" : SIMDLoadLane<I64x2, 0x57>;
+defm "" : SIMDLoadLane<8, 0x54>;
+defm "" : SIMDLoadLane<16, 0x55>;
+defm "" : SIMDLoadLane<32, 0x56>;
+defm "" : SIMDLoadLane<64, 0x57>;
// Select loads with no constant offset.
multiclass LoadLanePatNoOffset<Vec vec, SDPatternOperator kind> {
- defvar load_lane_a32 = !cast<NI>("LOAD_LANE_"#vec#"_A32");
- defvar load_lane_a64 = !cast<NI>("LOAD_LANE_"#vec#"_A64");
+ defvar load_lane_a32 = !cast<NI>("LOAD_LANE_"#vec.lane_bits#"_A32");
+ defvar load_lane_a64 = !cast<NI>("LOAD_LANE_"#vec.lane_bits#"_A64");
def : Pat<(vec.vt (kind (i32 I32:$addr),
(vec.vt V128:$vec), (i32 vec.lane_idx:$idx))),
(load_lane_a32 0, 0, imm:$idx, $addr, $vec)>,
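These no-offset patterns cover a lane insert from a plain pointer, e.g.
(a sketch, integer case; register names in the comment are illustrative):

  define <4 x i32> @set_lane2(<4 x i32> %v, ptr %p) {
    %x = load i32, ptr %p
    %r = insertelement <4 x i32> %v, i32 %x, i32 2
    ret <4 x i32> %r   ; expected: v128.load32_lane 0($1), $0, 2
  }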
@@ -354,17 +354,22 @@ def load16_lane :
PatFrag<(ops node:$ptr, node:$vec, node:$idx),
(vector_insert $vec, (i32 (extloadi16 $ptr)), $idx)>;
def load32_lane :
- PatFrag<(ops node:$ptr, node:$vec, node:$idx),
- (vector_insert $vec, (i32 (load $ptr)), $idx)>;
+ PatFrags<(ops node:$ptr, node:$vec, node:$idx), [
+ (vector_insert $vec, (i32 (load $ptr)), $idx),
+ (vector_insert $vec, (f32 (load $ptr)), $idx)
+]>;
def load64_lane :
- PatFrag<(ops node:$ptr, node:$vec, node:$idx),
- (vector_insert $vec, (i64 (load $ptr)), $idx)>;
-// TODO: floating point lanes as well
+ PatFrags<(ops node:$ptr, node:$vec, node:$idx), [
+ (vector_insert $vec, (i64 (load $ptr)), $idx),
+ (vector_insert $vec, (f64 (load $ptr)), $idx)
+]>;
defm : LoadLanePatNoOffset<I8x16, load8_lane>;
defm : LoadLanePatNoOffset<I16x8, load16_lane>;
defm : LoadLanePatNoOffset<I32x4, load32_lane>;
defm : LoadLanePatNoOffset<I64x2, load64_lane>;
+defm : LoadLanePatNoOffset<F32x4, load32_lane>;
+defm : LoadLanePatNoOffset<F64x2, load64_lane>;
// TODO: Also support the other load patterns for load_lane once the instructions
// are merged to the proposal.
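With load32_lane/load64_lane widened into PatFrags over both integer and
float inserts, plus the F32x4/F64x2 instantiations above, float lane
loads now select the same way, e.g. (a sketch):

  define <4 x float> @set_lane1_f32(<4 x float> %v, ptr %p) {
    %x = load float, ptr %p
    %r = insertelement <4 x float> %v, float %x, i32 1
    ret <4 x float> %r   ; now selects v128.load32_lane
  }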
@@ -1463,10 +1468,10 @@ def extloadv2f32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
// Adapted from the body of LoadPatNoOffset
// TODO: other addressing patterns
def : Pat<(v2f64 (extloadv2f32 (i32 I32:$addr))),
- (promote_low_F64x2 (LOAD_ZERO_I64x2_A32 0, 0, I32:$addr))>,
+ (promote_low_F64x2 (LOAD_ZERO_64_A32 0, 0, I32:$addr))>,
Requires<[HasAddr32]>;
def : Pat<(v2f64 (extloadv2f32 (i64 I64:$addr))),
- (promote_low_F64x2 (LOAD_ZERO_I64x2_A64 0, 0, I64:$addr))>,
+ (promote_low_F64x2 (LOAD_ZERO_64_A64 0, 0, I64:$addr))>,
Requires<[HasAddr64]>;
//===----------------------------------------------------------------------===//
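The updated extload pattern keeps working with the renamed instructions;
for reference, it matches a widening vector load such as (a sketch):

  define <2 x double> @promote(ptr %p) {
    %f = load <2 x float>, ptr %p
    %d = fpext <2 x float> %f to <2 x double>
    ret <2 x double> %d   ; v128.load64_zero + f64x2.promote_low_f32x4
  }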
diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
index a51b358de2e89..7d295f83e8f1f 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
@@ -440,3 +440,72 @@ define <2 x double> @all_undef_f64x2() {
; CHECK-NEXT: return $0
ret <2 x double> undef
}
+
+define <4 x i32> @load_zero_lane_i32x4(ptr %addr.a, ptr %addr.b, ptr %addr.c, ptr %addr.d) {
+; CHECK-LABEL: load_zero_lane_i32x4:
+; CHECK: .functype load_zero_lane_i32x4 (i32, i32, i32, i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.load32_zero $push0=, 0($0)
+; CHECK-NEXT: v128.load32_lane $push1=, 0($1), $pop0, 1
+; CHECK-NEXT: v128.load32_lane $push2=, 0($2), $pop1, 2
+; CHECK-NEXT: v128.load32_lane $push3=, 0($3), $pop2, 3
+; CHECK-NEXT: return $pop3
+ %a = load i32, ptr %addr.a
+ %b = load i32, ptr %addr.b
+ %c = load i32, ptr %addr.c
+ %d = load i32, ptr %addr.d
+ %v = insertelement <4 x i32> undef, i32 %a, i32 0
+ %v.1 = insertelement <4 x i32> %v, i32 %b, i32 1
+ %v.2 = insertelement <4 x i32> %v.1, i32 %c, i32 2
+ %v.3 = insertelement <4 x i32> %v.2, i32 %d, i32 3
+ ret <4 x i32> %v.3
+}
+
+define <2 x i64> @load_zero_lane_i64x2(ptr %addr.a, ptr %addr.b) {
+; CHECK-LABEL: load_zero_lane_i64x2:
+; CHECK: .functype load_zero_lane_i64x2 (i32, i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.load64_zero $push0=, 0($0)
+; CHECK-NEXT: v128.load64_lane $push1=, 0($1), $pop0, 1
+; CHECK-NEXT: return $pop1
+ %a = load i64, ptr %addr.a
+ %b = load i64, ptr %addr.b
+ %v = insertelement <2 x i64> undef, i64 %a, i32 0
+ %v.1 = insertelement <2 x i64> %v, i64 %b, i32 1
+ ret <2 x i64> %v.1
+}
+
+define <4 x float> @load_zero_lane_f32x4(ptr %addr.a, ptr %addr.b, ptr %addr.c, ptr %addr.d) {
+; CHECK-LABEL: load_zero_lane_f32x4:
+; CHECK: .functype load_zero_lane_f32x4 (i32, i32, i32, i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.load32_zero $push0=, 0($0)
+; CHECK-NEXT: v128.load32_lane $push1=, 0($1), $pop0, 1
+; CHECK-NEXT: v128.load32_lane $push2=, 0($2), $pop1, 2
+; CHECK-NEXT: v128.load32_lane $push3=, 0($3), $pop2, 3
+; CHECK-NEXT: return $pop3
+ %a = load float, ptr %addr.a
+ %b = load float, ptr %addr.b
+ %c = load float, ptr %addr.c
+ %d = load float, ptr %addr.d
+ %v = insertelement <4 x float> undef, float %a, i32 0
+ %v.1 = insertelement <4 x float> %v, float %b, i32 1
+ %v.2 = insertelement <4 x float> %v.1, float %c, i32 2
+ %v.3 = insertelement <4 x float> %v.2, float %d, i32 3
+ ret <4 x float> %v.3
+}
+
+define <2 x double> @load_zero_lane_f64x2(ptr %addr.a, ptr %addr.b) {
+; CHECK-LABEL: load_zero_lane_f64x2:
+; CHECK: .functype load_zero_lane_f64x2 (i32, i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.load64_zero $push0=, 0($0)
+; CHECK-NEXT: v128.load64_lane $push1=, 0($1), $pop0, 1
+; CHECK-NEXT: return $pop1
+ %a = load double, ptr %addr.a
+ %b = load double, ptr %addr.b
+ %v = insertelement <2 x double> undef, double %a, i32 0
+ %v.1 = insertelement <2 x double> %v, double %b, i32 1
+ ret <2 x double> %v.1
+}
+