[clang] [llvm] [WebAssembly] Implement prototype f32.load_f16 instruction. (PR #90906)
Brendan Dahl via cfe-commits
cfe-commits at lists.llvm.org
Mon May 6 13:47:04 PDT 2024
https://github.com/brendandahl updated https://github.com/llvm/llvm-project/pull/90906
>From 14313fa9ef33b4cbc8cf18f280ee885b38015ca4 Mon Sep 17 00:00:00 2001
From: Brendan Dahl <brendan.dahl at gmail.com>
Date: Wed, 1 May 2024 21:53:39 +0000
Subject: [PATCH 1/3] [WebAssembly] Implement prototype f32.load_f16
instruction.
Adds a builtin and intrinsic for the f32.load_f16 instruction.
The instruction loads an f16 value from memory and puts it in an f32.
Specified at:
https://github.com/WebAssembly/half-precision/blob/29a9b9462c9285d4ccc1a5dc39214ddfd1892658/proposals/half-precision/Overview.md
Note: the current spec has f32.load_f16 as opcode 0xFD0120, but this is incorrect
and will be changed to 0xFC30 soon.
---
clang/include/clang/Basic/BuiltinsWebAssembly.def | 3 +++
clang/lib/CodeGen/CGBuiltin.cpp | 5 +++++
clang/test/CodeGen/builtins-wasm.c | 9 +++++++--
llvm/include/llvm/IR/IntrinsicsWebAssembly.td | 12 ++++++++++++
.../MCTargetDesc/WebAssemblyMCTargetDesc.h | 1 +
.../Target/WebAssembly/WebAssemblyISelLowering.cpp | 8 ++++++++
.../lib/Target/WebAssembly/WebAssemblyInstrMemory.td | 5 +++++
llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 7 +++++++
llvm/test/CodeGen/WebAssembly/half-precision.ll | 12 ++++++++++++
llvm/test/MC/WebAssembly/simd-encodings.s | 5 ++++-
10 files changed, 64 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/WebAssembly/half-precision.ll
diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
index 7e950914ad946d..8b0a1d4579d84c 100644
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -190,6 +190,9 @@ TARGET_BUILTIN(__builtin_wasm_relaxed_dot_i8x16_i7x16_s_i16x8, "V8sV16ScV16Sc",
TARGET_BUILTIN(__builtin_wasm_relaxed_dot_i8x16_i7x16_add_s_i32x4, "V4iV16ScV16ScV4i", "nc", "relaxed-simd")
TARGET_BUILTIN(__builtin_wasm_relaxed_dot_bf16x8_add_f32_f32x4, "V4fV8UsV8UsV4f", "nc", "relaxed-simd")
+// Half-Precision (fp16)
+TARGET_BUILTIN(__builtin_wasm_loadf16_f32, "fi*", "nU", "half-precision")
+
// Reference Types builtins
// Some builtins are custom type-checked - see 't' as part of the third argument,
// in which case the argument spec (second argument) is unused.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a370734e00d3e1..e9d465bd2a6b01 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -21257,6 +21257,11 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(Intrinsic::wasm_relaxed_dot_bf16x8_add_f32);
return Builder.CreateCall(Callee, {LHS, RHS, Acc});
}
+ case WebAssembly::BI__builtin_wasm_loadf16_f32: {
+ Value *Addr = EmitScalarExpr(E->getArg(0));
+ Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_loadf16_f32);
+ return Builder.CreateCall(Callee, {Addr});
+ }
case WebAssembly::BI__builtin_wasm_table_get: {
assert(E->getArg(0)->getType()->isArrayType());
Value *Table = EmitArrayToPointerDecay(E->getArg(0)).emitRawPointer(*this);
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
index 9a323da9a8e846..a845d5429039d4 100644
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple wasm32-unknown-unknown -target-feature +reference-types -target-feature +simd128 -target-feature +relaxed-simd -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -flax-vector-conversions=none -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes WEBASSEMBLY,WEBASSEMBLY32
-// RUN: %clang_cc1 -triple wasm64-unknown-unknown -target-feature +reference-types -target-feature +simd128 -target-feature +relaxed-simd -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -flax-vector-conversions=none -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes WEBASSEMBLY,WEBASSEMBLY64
+// RUN: %clang_cc1 -triple wasm32-unknown-unknown -target-feature +reference-types -target-feature +simd128 -target-feature +relaxed-simd -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -target-feature +half-precision -flax-vector-conversions=none -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes WEBASSEMBLY,WEBASSEMBLY32
+// RUN: %clang_cc1 -triple wasm64-unknown-unknown -target-feature +reference-types -target-feature +simd128 -target-feature +relaxed-simd -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -target-feature +half-precision -flax-vector-conversions=none -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes WEBASSEMBLY,WEBASSEMBLY64
// RUN: not %clang_cc1 -triple wasm64-unknown-unknown -target-feature +reference-types -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -flax-vector-conversions=none -O3 -emit-llvm -o - %s 2>&1 | FileCheck %s -check-prefixes MISSING-SIMD
// SIMD convenience types
@@ -802,6 +802,11 @@ f32x4 relaxed_dot_bf16x8_add_f32_f32x4(u16x8 a, u16x8 b, f32x4 c) {
// WEBASSEMBLY-NEXT: ret
}
+float load_f16_f32(int *addr) {
+ return __builtin_wasm_loadf16_f32(addr);
+ // WEBASSEMBLY: call float @llvm.wasm.loadf16.f32(ptr %{{.*}})
+}
+
__externref_t externref_null() {
return __builtin_wasm_ref_null_extern();
// WEBASSEMBLY: tail call ptr addrspace(10) @llvm.wasm.ref.null.extern()
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index b93a5e7be1b51e..f8142a8ca9e934 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -321,6 +321,18 @@ def int_wasm_relaxed_dot_bf16x8_add_f32:
[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4f32_ty],
[IntrNoMem, IntrSpeculatable]>;
+//===----------------------------------------------------------------------===//
+// Half-precision intrinsics (experimental)
+//===----------------------------------------------------------------------===//
+
+// TODO: Replace these intrinsic with normal ISel patterns once the XXX
+// instructions are merged to the proposal.
+def int_wasm_loadf16_f32:
+ Intrinsic<[llvm_float_ty],
+ [llvm_ptr_ty],
+ [IntrReadMem, IntrArgMemOnly],
+ "", [SDNPMemOperand]>;
+
//===----------------------------------------------------------------------===//
// Thread-local storage intrinsics
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 15aeaaeb8c4a4e..d3b496ae591798 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -206,6 +206,7 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
WASM_LOAD_STORE(LOAD16_SPLAT)
WASM_LOAD_STORE(LOAD_LANE_I16x8)
WASM_LOAD_STORE(STORE_LANE_I16x8)
+ WASM_LOAD_STORE(LOAD_F16_F32)
return 1;
WASM_LOAD_STORE(LOAD_I32)
WASM_LOAD_STORE(LOAD_F32)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 64bcadf3f5677c..ed52fe53bc6096 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -906,6 +906,14 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.align = Align(8);
Info.flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad;
return true;
+ case Intrinsic::wasm_loadf16_f32:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::f16;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = Align(2);
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
default:
return false;
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index 01c0909af72e5e..e4baf842462a96 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -72,6 +72,9 @@ defm LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.load16_u", 0x33, []>;
defm LOAD32_S_I64 : WebAssemblyLoad<I64, "i64.load32_s", 0x34, []>;
defm LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.load32_u", 0x35, []>;
+// Half Precision
+defm LOAD_F16_F32 : WebAssemblyLoad<F32, "f32.load_f16", 0xfc30, [HasHalfPrecision]>;
+
// Pattern matching
multiclass LoadPat<ValueType ty, SDPatternOperator kind, string Name> {
@@ -111,6 +114,8 @@ defm : LoadPat<i64, extloadi8, "LOAD8_U_I64">;
defm : LoadPat<i64, extloadi16, "LOAD16_U_I64">;
defm : LoadPat<i64, extloadi32, "LOAD32_U_I64">;
+defm : LoadPat<f32, int_wasm_loadf16_f32, "LOAD_F16_F32">;
+
// Defines atomic and non-atomic stores, regular and truncating
multiclass WebAssemblyStore<WebAssemblyRegClass rc, string Name, int Opcode,
list<Predicate> reqs = []> {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index af95dfa25a1893..ccf72192eb11c5 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -38,6 +38,13 @@ multiclass RELAXED_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
asmstr_s, simdop, HasRelaxedSIMD>;
}
+multiclass HALF_PRECISION_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
+ list<dag> pattern_r, string asmstr_r = "",
+ string asmstr_s = "", bits<32> simdop = -1> {
+ defm "" : ABSTRACT_SIMD_I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r,
+ asmstr_s, simdop, HasHalfPrecision>;
+}
+
defm "" : ARGUMENT<V128, v16i8>;
defm "" : ARGUMENT<V128, v8i16>;
diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll
new file mode 100644
index 00000000000000..582771d3f95fcb
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s --mtriple=wasm32-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+half-precision | FileCheck %s
+; RUN: llc < %s --mtriple=wasm64-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+half-precision | FileCheck %s
+
+declare float @llvm.wasm.loadf32.f16(ptr)
+
+; CHECK-LABEL: ldf16_32:
+; CHECK: f32.load_f16 $push[[NUM0:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM0]]{{$}}
+define float @ldf16_32(ptr %p) {
+ %v = call float @llvm.wasm.loadf16.f32(ptr %p)
+ ret float %v
+}
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
index c6c554990c2c4f..e7c3761f381d03 100644
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -no-type-check -show-encoding -triple=wasm32-unknown-unknown -mattr=+simd128,+relaxed-simd < %s | FileCheck %s
+# RUN: llvm-mc -no-type-check -show-encoding -triple=wasm32-unknown-unknown -mattr=+simd128,+relaxed-simd,+half-precision < %s | FileCheck %s
main:
.functype main () -> ()
@@ -839,4 +839,7 @@ main:
# CHECK: i32x4.relaxed_dot_i8x16_i7x16_add_s # encoding: [0xfd,0x93,0x02]
i32x4.relaxed_dot_i8x16_i7x16_add_s
+ # CHECK: f32.load_f16 48 # encoding: [0xfc,0x30,0x01,0x30]
+ f32.load_f16 48
+
end_function
>From aa8b37ad5f0ecce01be0fcb7bf13695a08146867 Mon Sep 17 00:00:00 2001
From: Brendan Dahl <brendan.dahl at gmail.com>
Date: Thu, 2 May 2024 22:59:17 +0000
Subject: [PATCH 2/3] Use short for builtin. Alternatively we could use
_Float16, but then need to enable that for the platform.
---
clang/include/clang/Basic/BuiltinsWebAssembly.def | 2 +-
clang/test/CodeGen/builtins-wasm.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
index 8b0a1d4579d84c..7b518ca471a755 100644
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -191,7 +191,7 @@ TARGET_BUILTIN(__builtin_wasm_relaxed_dot_i8x16_i7x16_add_s_i32x4, "V4iV16ScV16S
TARGET_BUILTIN(__builtin_wasm_relaxed_dot_bf16x8_add_f32_f32x4, "V4fV8UsV8UsV4f", "nc", "relaxed-simd")
// Half-Precision (fp16)
-TARGET_BUILTIN(__builtin_wasm_loadf16_f32, "fi*", "nU", "half-precision")
+TARGET_BUILTIN(__builtin_wasm_loadf16_f32, "fs*", "nU", "half-precision")
// Reference Types builtins
// Some builtins are custom type-checked - see 't' as part of the third argument,
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
index a845d5429039d4..b67d179756954d 100644
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -802,7 +802,7 @@ f32x4 relaxed_dot_bf16x8_add_f32_f32x4(u16x8 a, u16x8 b, f32x4 c) {
// WEBASSEMBLY-NEXT: ret
}
-float load_f16_f32(int *addr) {
+float load_f16_f32(short *addr) {
return __builtin_wasm_loadf16_f32(addr);
// WEBASSEMBLY: call float @llvm.wasm.loadf16.f32(ptr %{{.*}})
}
>From e3e90b509d42333a2efc1533ff2ddb8a0cbc1c42 Mon Sep 17 00:00:00 2001
From: Brendan Dahl <brendan.dahl at gmail.com>
Date: Mon, 6 May 2024 20:46:45 +0000
Subject: [PATCH 3/3] Review comments.
---
.../clang/Basic/BuiltinsWebAssembly.def | 2 +-
clang/test/CodeGen/builtins-wasm.c | 2 +-
.../WebAssembly/WebAssemblyInstrSIMD.td | 7 -----
llvm/test/CodeGen/WebAssembly/offset.ll | 28 ++++++++++++++++++-
4 files changed, 29 insertions(+), 10 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
index 7b518ca471a755..cf54f8f4422f88 100644
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -191,7 +191,7 @@ TARGET_BUILTIN(__builtin_wasm_relaxed_dot_i8x16_i7x16_add_s_i32x4, "V4iV16ScV16S
TARGET_BUILTIN(__builtin_wasm_relaxed_dot_bf16x8_add_f32_f32x4, "V4fV8UsV8UsV4f", "nc", "relaxed-simd")
// Half-Precision (fp16)
-TARGET_BUILTIN(__builtin_wasm_loadf16_f32, "fs*", "nU", "half-precision")
+TARGET_BUILTIN(__builtin_wasm_loadf16_f32, "fh*", "nU", "half-precision")
// Reference Types builtins
// Some builtins are custom type-checked - see 't' as part of the third argument,
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
index b67d179756954d..ab1c6cd494ae57 100644
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -802,7 +802,7 @@ f32x4 relaxed_dot_bf16x8_add_f32_f32x4(u16x8 a, u16x8 b, f32x4 c) {
// WEBASSEMBLY-NEXT: ret
}
-float load_f16_f32(short *addr) {
+float load_f16_f32(__fp16 *addr) {
return __builtin_wasm_loadf16_f32(addr);
// WEBASSEMBLY: call float @llvm.wasm.loadf16.f32(ptr %{{.*}})
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index ccf72192eb11c5..af95dfa25a1893 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -38,13 +38,6 @@ multiclass RELAXED_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
asmstr_s, simdop, HasRelaxedSIMD>;
}
-multiclass HALF_PRECISION_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
- list<dag> pattern_r, string asmstr_r = "",
- string asmstr_s = "", bits<32> simdop = -1> {
- defm "" : ABSTRACT_SIMD_I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r,
- asmstr_s, simdop, HasHalfPrecision>;
-}
-
defm "" : ARGUMENT<V128, v16i8>;
defm "" : ARGUMENT<V128, v8i16>;
diff --git a/llvm/test/CodeGen/WebAssembly/offset.ll b/llvm/test/CodeGen/WebAssembly/offset.ll
index 0d9fcf05ab1bf8..b497ddd7273a0c 100644
--- a/llvm/test/CodeGen/WebAssembly/offset.ll
+++ b/llvm/test/CodeGen/WebAssembly/offset.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -wasm-disable-explicit-locals -wasm-keep-registers -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -wasm-disable-explicit-locals -wasm-keep-registers -disable-wasm-fallthrough-return-opt -mattr=+half-precision | FileCheck %s
; Test constant load and store address offsets.
@@ -666,3 +666,29 @@ define {i32,i32,i32,i32} @aggregate_return() {
define {i64,i32,i16,i8} @aggregate_return_without_merge() {
ret {i64,i32,i16,i8} zeroinitializer
}
+
+;===----------------------------------------------------------------------------
+; Loads: Half Precision
+;===----------------------------------------------------------------------------
+
+; Fold an offset into a zero-extending load.
+
+; CHECK-LABEL: load_f16_f32_with_folded_offset:
+; CHECK: f32.load_f16 $push0=, 24($0){{$}}
+define float @load_f16_f32_with_folded_offset(ptr %p) {
+ %q = ptrtoint ptr %p to i32
+ %r = add nuw i32 %q, 24
+ %s = inttoptr i32 %r to ptr
+ %t = call float @llvm.wasm.loadf16.f32(ptr %s)
+ ret float %t
+}
+
+; Fold a gep offset into a zero-extending load.
+
+; CHECK-LABEL: load_f16_f32_with_folded_gep_offset:
+; CHECK: f32.load_f16 $push0=, 24($0){{$}}
+define float @load_f16_f32_with_folded_gep_offset(ptr %p) {
+ %s = getelementptr inbounds i8, ptr %p, i32 24
+ %t = call float @llvm.wasm.loadf16.f32(ptr %s)
+ ret float %t
+}
More information about the cfe-commits
mailing list