[PATCH] [AArch64] Fix a bug in the vset_lane_f16/vget_lane_f16 intrinsics caused by casting from float16_t to int16_t
Hao Liu
Hao.Liu at arm.com
Thu Jan 23 02:25:03 PST 2014
Hi Tim and reviewers,
Currently, vset_lane_f16/vget_lane_f16 are implemented as macros along these lines:
#define vset_lane_f16(a, b, __c) ( { \
    float16_t __a = (a); float16x4_t __b = (b); \
    int16_t __a1 = (int16_t) __a; \
    ...
There is a cast from float16_t to int16_t (from variable __a to __a1). Obviously, this changes the semantics: we want a bitwise reinterpretation, not a value conversion from float to int. So if we set 3.5 into a vector lane and then read it back from that lane, we get an incorrect result: 0.
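To see the difference outside of NEON, here is a minimal sketch (assuming a clang target that supports the __fp16 storage type; the function name is just for illustration):

  #include <stdint.h>
  #include <string.h>

  void cast_vs_bitcast(void) {
    __fp16 h = 3.5;                        /* half-precision encoding of 3.5 is 0x4300 */
    int16_t as_value = (int16_t)h;         /* value conversion: yields 3 */
    int16_t as_bits;
    memcpy(&as_bits, &h, sizeof as_bits);  /* bitwise reinterpretation: yields 0x4300 */
  }

The macro performs the value conversion where the bitwise one was intended, so the lane receives the integer 3 (bit pattern 0x0003, a tiny subnormal half) rather than the encoding of 3.5.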
This patch fixes the problem by implementing these intrinsics with builtin functions. The scalar float16_t is then passed through directly, without any cast.
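With the patch applied, a round trip through a lane behaves as expected. A hypothetical usage sketch (not taken from the patch itself):

  #include <arm_neon.h>

  float16x4_t set_then_get(float16x4_t v) {
    v = vset_lane_f16(3.5, v, 3);       /* lane 3 now holds 3.5 */
    float16_t x = vget_lane_f16(v, 3);  /* reads back 3.5; the old macros gave ~0 */
    (void)x;
    return v;
  }

This mirrors the new test_vset_get_lane_f16 test in the diff below, which checks that the round-tripped 3.5 converts to the integer 3.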
Please review.
Thanks,
-Hao
http://llvm-reviews.chandlerc.com/D2602
Files:
include/clang/Basic/arm_neon.td
lib/CodeGen/CGBuiltin.cpp
test/CodeGen/aarch64-neon-copy.c
utils/TableGen/NeonEmitter.cpp
Index: include/clang/Basic/arm_neon.td
===================================================================
--- include/clang/Basic/arm_neon.td
+++ include/clang/Basic/arm_neon.td
@@ -866,9 +866,9 @@
////////////////////////////////////////////////////////////////////////////////
// Extract or insert element from vector
def GET_LANE : IInst<"vget_lane", "sdi",
- "csilPcPsUcUsUiUlQcQsQiQlQUcQUsQUiQUlPcPsQPcQPsfdQfQdPlQPl">;
+ "csilPcPsUcUsUiUlQcQsQiQlQUcQUsQUiQUlPcPsQPcQPshfdQhQfQdPlQPl">;
def SET_LANE : IInst<"vset_lane", "dsdi",
- "csilPcPsUcUsUiUlQcQsQiQlQUcQUsQUiQUlPcPsQPcQPsfdQfQdPlQPl">;
+ "csilPcPsUcUsUiUlQcQsQiQlQUcQUsQUiQUlPcPsQPcQPshfdQhQfQdPlQPl">;
def COPY_LANE : IOpInst<"vcopy_lane", "ddidi",
"csilPcPsUcUsUiUlPcPsPlfd", OP_COPY_LN>;
def COPYQ_LANE : IOpInst<"vcopy_lane", "ddigi",
@@ -1340,7 +1340,4 @@
def SCALAR_VDUP_LANE : IInst<"vdup_lane", "sdi", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">;
def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "sji", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">;
-
-def SCALAR_GET_LANE : IOpInst<"vget_lane", "sdi", "hQh", OP_SCALAR_GET_LN>;
-def SCALAR_SET_LANE : IOpInst<"vset_lane", "dsdi", "hQh", OP_SCALAR_SET_LN>;
}
Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -1876,12 +1876,14 @@
case AArch64::BI__builtin_neon_vget_lane_i16:
case AArch64::BI__builtin_neon_vget_lane_i32:
case AArch64::BI__builtin_neon_vget_lane_i64:
+ case AArch64::BI__builtin_neon_vget_lane_f16:
case AArch64::BI__builtin_neon_vget_lane_f32:
case AArch64::BI__builtin_neon_vget_lane_f64:
case AArch64::BI__builtin_neon_vgetq_lane_i8:
case AArch64::BI__builtin_neon_vgetq_lane_i16:
case AArch64::BI__builtin_neon_vgetq_lane_i32:
case AArch64::BI__builtin_neon_vgetq_lane_i64:
+ case AArch64::BI__builtin_neon_vgetq_lane_f16:
case AArch64::BI__builtin_neon_vgetq_lane_f32:
case AArch64::BI__builtin_neon_vgetq_lane_f64:
return CGF.EmitARMBuiltinExpr(ARM::BI__builtin_neon_vget_lane_i8, E);
@@ -1889,12 +1891,14 @@
case AArch64::BI__builtin_neon_vset_lane_i16:
case AArch64::BI__builtin_neon_vset_lane_i32:
case AArch64::BI__builtin_neon_vset_lane_i64:
+ case AArch64::BI__builtin_neon_vset_lane_f16:
case AArch64::BI__builtin_neon_vset_lane_f32:
case AArch64::BI__builtin_neon_vset_lane_f64:
case AArch64::BI__builtin_neon_vsetq_lane_i8:
case AArch64::BI__builtin_neon_vsetq_lane_i16:
case AArch64::BI__builtin_neon_vsetq_lane_i32:
case AArch64::BI__builtin_neon_vsetq_lane_i64:
+ case AArch64::BI__builtin_neon_vsetq_lane_f16:
case AArch64::BI__builtin_neon_vsetq_lane_f32:
case AArch64::BI__builtin_neon_vsetq_lane_f64:
return CGF.EmitARMBuiltinExpr(ARM::BI__builtin_neon_vset_lane_i8, E);
Index: test/CodeGen/aarch64-neon-copy.c
===================================================================
--- test/CodeGen/aarch64-neon-copy.c
+++ test/CodeGen/aarch64-neon-copy.c
@@ -1274,18 +1274,16 @@
// CHECK: test_vset_lane_f16
float16x4_t test_vset_lane_f16(float16x4_t v1) {
- float16_t a;
+ float16_t a = 0;
return vset_lane_f16(a, v1, 3);
-// CHECK: fmov {{s[0-9]+}}, wzr
-// CHECK-NEXT: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[0]
+// CHECK: ins {{v[0-9]+}}.h[3], wzr
}
// CHECK: test_vsetq_lane_f16
float16x8_t test_vsetq_lane_f16(float16x8_t v1) {
- float16_t a;
+ float16_t a = 0;
return vsetq_lane_f16(a, v1, 7);
-// CHECK: fmov {{s[0-9]+}}, wzr
-// CHECK-NEXT: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[0]
+// CHECK: ins {{v[0-9]+}}.h[7], wzr
}
// CHECK: test_vset_lane_f16_2
@@ -1365,3 +1363,11 @@
// CHECK-NEXT: ret
}
+// CHECK-LABEL: test_vset_get_lane_f16:
+int test_vset_get_lane_f16(float16x8_t a) {
+ float16x8_t b;
+ b = vsetq_lane_f16(3.5, a, 5);
+ float16_t c = vgetq_lane_f16(b, 5);
+ return (int)c;
+// CHECK: movz x{{[0-9]+}}, #3
+}
Index: utils/TableGen/NeonEmitter.cpp
===================================================================
--- utils/TableGen/NeonEmitter.cpp
+++ utils/TableGen/NeonEmitter.cpp
@@ -760,7 +760,7 @@
type = ModType(mod, type, quad, poly, usgn, scal, cnst, pntr);
usgn = usgn | poly | ((ck == ClassI || ck == ClassW) &&
- scal && type != 'f' && type != 'd');
+ scal && type != 'h' && type != 'f' && type != 'd');
// All pointers are void* pointers. Change type to 'v' now.
if (pntr) {
@@ -768,11 +768,6 @@
poly = false;
type = 'v';
}
- // Treat half-float ('h') types as unsigned short ('s') types.
- if (type == 'h') {
- type = 's';
- usgn = true;
- }
if (scal) {
SmallString<128> s;
@@ -796,6 +791,12 @@
return s.str();
}
+ // Treat half-float ('h') types as unsigned short ('s') types.
+ if (type == 'h') {
+ type = 's';
+ usgn = true;
+ }
+
// Since the return value must be one type, return a vector type of the
// appropriate width which we will bitcast. An exception is made for
// returning structs of 2, 3, or 4 vectors which are returned in a sret-like