[PATCH] [AArch64]Fix a bug about vset_lane_f16/vget_lane_f16 intrinsics caused by casting from float16_t to int16_t

Ana Pazos apazos at codeaurora.org
Thu Jan 23 14:38:18 PST 2014


Hao,

What is the failing test case?

Isn't float16_t just a storage type? The casting operation will do nothing in this case, there won't be a fp to integer conversion instruction being generated.

We on purpose used macro to avoid the builtin call with a float16_t type.

We had this discussion in the past with Tim and he warned us that 

"ACLE itself says that __fp16 (and by extension float16_t) should *not* be permitted as an argument or return type (section 4.1.2)."

Can you send me the failing test case please.

Thanks,
Ana.

-----Original Message-----
From: cfe-commits-bounces at cs.uiuc.edu [mailto:cfe-commits-bounces at cs.uiuc.edu] On Behalf Of Hao Liu
Sent: Thursday, January 23, 2014 2:25 AM
To: t.p.northover at gmail.com; Hao.Liu at arm.com
Cc: cfe-commits at cs.uiuc.edu
Subject: [PATCH] [AArch64]Fix a bug about vset_lane_f16/vget_lane_f16 intrinsics caused by casting from float16_t to int16_t

Hi t.p.northover,

Hi Tim and reviewers,

Currently, vset_lane_f16/vget_lane_f16 are implemented by macro like:
   #define vset_lane_f16(a, b, __c) ( { \
          float16_t __a = (a); float16x4_t __b = (b); \
          int16_t __a1 = (int16_t) __a; \
          ...
There is a cast from float16_t to int16_t (from variable __a to __a1). Obviously, this will change the semantics: we want a bitwise reinterpretation, not a type cast from float to int. So if we set 3.5 to a vector lane and then read from that lane, we'll get an incorrect result: 0.

This patch fixes this problem by implementing such intrinsics by builtin functions. The scalar type float16_t is passed directly without cast.

Review, please.

Thanks,
-Hao

http://llvm-reviews.chandlerc.com/D2602

Files:
  include/clang/Basic/arm_neon.td
  lib/CodeGen/CGBuiltin.cpp
  test/CodeGen/aarch64-neon-copy.c
  utils/TableGen/NeonEmitter.cpp

Index: include/clang/Basic/arm_neon.td
===================================================================
--- include/clang/Basic/arm_neon.td
+++ include/clang/Basic/arm_neon.td
@@ -866,9 +866,9 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Extract or insert element from vector
 def GET_LANE : IInst<"vget_lane", "sdi",
-                     "csilPcPsUcUsUiUlQcQsQiQlQUcQUsQUiQUlPcPsQPcQPsfdQfQdPlQPl">;
+                     "csilPcPsUcUsUiUlQcQsQiQlQUcQUsQUiQUlPcPsQPcQPshfdQhQfQdPlQPl">;
 def SET_LANE : IInst<"vset_lane", "dsdi",
-                     "csilPcPsUcUsUiUlQcQsQiQlQUcQUsQUiQUlPcPsQPcQPsfdQfQdPlQPl">;
+                     "csilPcPsUcUsUiUlQcQsQiQlQUcQUsQUiQUlPcPsQPcQPshfdQhQfQdPlQPl">;
 def COPY_LANE : IOpInst<"vcopy_lane", "ddidi",
                         "csilPcPsUcUsUiUlPcPsPlfd", OP_COPY_LN>;
 def COPYQ_LANE : IOpInst<"vcopy_lane", "ddigi",
@@ -1340,7 +1340,4 @@
 
 def SCALAR_VDUP_LANE : IInst<"vdup_lane", "sdi", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">;
 def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "sji", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">;
-
-def SCALAR_GET_LANE : IOpInst<"vget_lane", "sdi", "hQh", OP_SCALAR_GET_LN>;
-def SCALAR_SET_LANE : IOpInst<"vset_lane", "dsdi", "hQh", OP_SCALAR_SET_LN>;
 }
Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -1876,12 +1876,14 @@
   case AArch64::BI__builtin_neon_vget_lane_i16:
   case AArch64::BI__builtin_neon_vget_lane_i32:
   case AArch64::BI__builtin_neon_vget_lane_i64:
+  case AArch64::BI__builtin_neon_vget_lane_f16:
   case AArch64::BI__builtin_neon_vget_lane_f32:
   case AArch64::BI__builtin_neon_vget_lane_f64:
   case AArch64::BI__builtin_neon_vgetq_lane_i8:
   case AArch64::BI__builtin_neon_vgetq_lane_i16:
   case AArch64::BI__builtin_neon_vgetq_lane_i32:
   case AArch64::BI__builtin_neon_vgetq_lane_i64:
+  case AArch64::BI__builtin_neon_vgetq_lane_f16:
   case AArch64::BI__builtin_neon_vgetq_lane_f32:
   case AArch64::BI__builtin_neon_vgetq_lane_f64:
    return CGF.EmitARMBuiltinExpr(ARM::BI__builtin_neon_vget_lane_i8, E);
@@ -1889,12 +1891,14 @@
   case AArch64::BI__builtin_neon_vset_lane_i16:
   case AArch64::BI__builtin_neon_vset_lane_i32:
   case AArch64::BI__builtin_neon_vset_lane_i64:
+  case AArch64::BI__builtin_neon_vset_lane_f16:
   case AArch64::BI__builtin_neon_vset_lane_f32:
   case AArch64::BI__builtin_neon_vset_lane_f64:
   case AArch64::BI__builtin_neon_vsetq_lane_i8:
   case AArch64::BI__builtin_neon_vsetq_lane_i16:
   case AArch64::BI__builtin_neon_vsetq_lane_i32:
   case AArch64::BI__builtin_neon_vsetq_lane_i64:
+  case AArch64::BI__builtin_neon_vsetq_lane_f16:
   case AArch64::BI__builtin_neon_vsetq_lane_f32:
   case AArch64::BI__builtin_neon_vsetq_lane_f64:
     return CGF.EmitARMBuiltinExpr(ARM::BI__builtin_neon_vset_lane_i8, E);
Index: test/CodeGen/aarch64-neon-copy.c
===================================================================
--- test/CodeGen/aarch64-neon-copy.c
+++ test/CodeGen/aarch64-neon-copy.c
@@ -1274,18 +1274,16 @@
 
 // CHECK: test_vset_lane_f16
 float16x4_t test_vset_lane_f16(float16x4_t v1) {
-  float16_t a;
+  float16_t a = 0;
   return vset_lane_f16(a, v1, 3);
-// CHECK: fmov  {{s[0-9]+}}, wzr
-// CHECK-NEXT: ins {{v[0-9]+}}.h[3],  {{v[0-9]+}}.h[0]
+// CHECK: ins {{v[0-9]+}}.h[3], wzr
 }
 
 // CHECK: test_vsetq_lane_f16
 float16x8_t test_vsetq_lane_f16(float16x8_t v1) {
-  float16_t a;
+  float16_t a = 0;
   return vsetq_lane_f16(a, v1, 7);
-// CHECK: fmov  {{s[0-9]+}}, wzr
-// CHECK-NEXT: ins {{v[0-9]+}}.h[7],  {{v[0-9]+}}.h[0]
+// CHECK: ins {{v[0-9]+}}.h[7], wzr
 }
 
 // CHECK: test_vset_lane_f16_2
@@ -1365,3 +1363,11 @@
 // CHECK-NEXT: ret
 }
 
+// CHECK-LABEL: test_vset_get_lane_f16:
+int test_vset_get_lane_f16(float16x8_t a) {
+  float16x8_t b;
+  b = vsetq_lane_f16(3.5, a, 5);
+  float16_t c = vgetq_lane_f16(b, 5);
+  return (int)c;
+// CHECK: movz x{{[0-9]+}}, #3
+}
Index: utils/TableGen/NeonEmitter.cpp
===================================================================
--- utils/TableGen/NeonEmitter.cpp
+++ utils/TableGen/NeonEmitter.cpp
@@ -760,7 +760,7 @@
   type = ModType(mod, type, quad, poly, usgn, scal, cnst, pntr);
 
   usgn = usgn | poly | ((ck == ClassI || ck == ClassW) &&
-                         scal && type != 'f' && type != 'd');
+                         scal && type != 'h' && type != 'f' && type != 'd');
 
   // All pointers are void* pointers.  Change type to 'v' now.
   if (pntr) {
@@ -768,11 +768,6 @@
     poly = false;
     type = 'v';
   }
-  // Treat half-float ('h') types as unsigned short ('s') types.
-  if (type == 'h') {
-    type = 's';
-    usgn = true;
-  }
 
   if (scal) {
     SmallString<128> s;
@@ -796,6 +791,12 @@
     return s.str();
   }
 
+  // Treat half-float ('h') types as unsigned short ('s') types.
+  if (type == 'h') {
+    type = 's';
+    usgn = true;
+  }
+
   // Since the return value must be one type, return a vector type of the
   // appropriate width which we will bitcast.  An exception is made for
   // returning structs of 2, 3, or 4 vectors which are returned in a sret-like





More information about the cfe-commits mailing list