[PATCH] [AArch64]Fix a bug about vset_lane_f16/vget_lane_f16 intrinsics caused by casting from float16_t to int16_t

Thu Jan 23 02:25:03 PST 2014

Hi t.p.northover,

Hi Tim and reviewers,

Currently, vset_lane_f16/vget_lane_f16 are implemented by macro like:
   #define vset_lane_f16(a, b, __c) ( { \
          float16_t __a = (a); float16x4_t __b = (b); \
          int16_t __a1 = (int16_t) __a; \
          ...
There is a cast from float16_t to int16_t (from variable __a to __a1). Obviously, this well change the semantic. We just want bitwise convert, not type cast from float to int. So if we set 3.5 to a vector lane and get from that lane, we'll get incorrect result: 0.

This patch fixes this problem by implementing such intrinsics by builtin functions. The scalar type float16_t is passed directly without cast.

Review, please.

Thanks,
-Hao

http://llvm-reviews.chandlerc.com/D2602

Files:
  include/clang/Basic/arm_neon.td
  lib/CodeGen/CGBuiltin.cpp
  test/CodeGen/aarch64-neon-copy.c
  utils/TableGen/NeonEmitter.cpp

Index: include/clang/Basic/arm_neon.td
===================================================================

--- include/clang/Basic/arm_neon.td
+++ include/clang/Basic/arm_neon.td
@@ -866,9 +866,9 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Extract or insert element from vector
 def GET_LANE : IInst<"vget_lane", "sdi",
-                     "csilPcPsUcUsUiUlQcQsQiQlQUcQUsQUiQUlPcPsQPcQPsfdQfQdPlQPl">;
+                "csilPcPsUcUsUiUlQcQsQiQlQUcQUsQUiQUlPcPsQPcQPshfdQhQfQdPlQPl">;
 def SET_LANE : IInst<"vset_lane", "dsdi",
-                     "csilPcPsUcUsUiUlQcQsQiQlQUcQUsQUiQUlPcPsQPcQPsfdQfQdPlQPl">;
+                "csilPcPsUcUsUiUlQcQsQiQlQUcQUsQUiQUlPcPsQPcQPshfdQhQfQdPlQPl">;
 def COPY_LANE : IOpInst<"vcopy_lane", "ddidi",
                         "csilPcPsUcUsUiUlPcPsPlfd", OP_COPY_LN>;
 def COPYQ_LANE : IOpInst<"vcopy_lane", "ddigi",
@@ -1340,7 +1340,4 @@
 
 def SCALAR_VDUP_LANE : IInst<"vdup_lane", "sdi", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">;
 def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "sji", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">;
-
-def SCALAR_GET_LANE : IOpInst<"vget_lane", "sdi", "hQh", OP_SCALAR_GET_LN>;
-def SCALAR_SET_LANE : IOpInst<"vset_lane", "dsdi", "hQh", OP_SCALAR_SET_LN>;
 }
Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -1876,12 +1876,14 @@
   case AArch64::BI__builtin_neon_vget_lane_i16:
   case AArch64::BI__builtin_neon_vget_lane_i32:
   case AArch64::BI__builtin_neon_vget_lane_i64:
+  case AArch64::BI__builtin_neon_vget_lane_f16:
   case AArch64::BI__builtin_neon_vget_lane_f32:
   case AArch64::BI__builtin_neon_vget_lane_f64:
   case AArch64::BI__builtin_neon_vgetq_lane_i8:
   case AArch64::BI__builtin_neon_vgetq_lane_i16:
   case AArch64::BI__builtin_neon_vgetq_lane_i32:
   case AArch64::BI__builtin_neon_vgetq_lane_i64:
+  case AArch64::BI__builtin_neon_vgetq_lane_f16:
   case AArch64::BI__builtin_neon_vgetq_lane_f32:
   case AArch64::BI__builtin_neon_vgetq_lane_f64:
     return CGF.EmitARMBuiltinExpr(ARM::BI__builtin_neon_vget_lane_i8, E);
@@ -1889,12 +1891,14 @@
   case AArch64::BI__builtin_neon_vset_lane_i16:
   case AArch64::BI__builtin_neon_vset_lane_i32:
   case AArch64::BI__builtin_neon_vset_lane_i64:
+  case AArch64::BI__builtin_neon_vset_lane_f16:
   case AArch64::BI__builtin_neon_vset_lane_f32:
   case AArch64::BI__builtin_neon_vset_lane_f64:
   case AArch64::BI__builtin_neon_vsetq_lane_i8:
   case AArch64::BI__builtin_neon_vsetq_lane_i16:
   case AArch64::BI__builtin_neon_vsetq_lane_i32:
   case AArch64::BI__builtin_neon_vsetq_lane_i64:
+  case AArch64::BI__builtin_neon_vsetq_lane_f16:
   case AArch64::BI__builtin_neon_vsetq_lane_f32:
   case AArch64::BI__builtin_neon_vsetq_lane_f64:
     return CGF.EmitARMBuiltinExpr(ARM::BI__builtin_neon_vset_lane_i8, E);
Index: test/CodeGen/aarch64-neon-copy.c
===================================================================
--- test/CodeGen/aarch64-neon-copy.c
+++ test/CodeGen/aarch64-neon-copy.c
@@ -1274,18 +1274,16 @@
 
 // CHECK: test_vset_lane_f16
 float16x4_t test_vset_lane_f16(float16x4_t v1) {
-  float16_t a;
+  float16_t a = 0;
   return vset_lane_f16(a, v1, 3);
-// CHECK: fmov  {{s[0-9]+}}, wzr
-// CHECK-NEXT: ins {{v[0-9]+}}.h[3],  {{v[0-9]+}}.h[0]
+// CHECK: ins {{v[0-9]+}}.h[3], wzr
 }
 
 // CHECK: test_vsetq_lane_f16
 float16x8_t test_vsetq_lane_f16(float16x8_t v1) {
-  float16_t a;
+  float16_t a = 0;
   return vsetq_lane_f16(a, v1, 7);
-// CHECK: fmov  {{s[0-9]+}}, wzr
-// CHECK-NEXT: ins {{v[0-9]+}}.h[7],  {{v[0-9]+}}.h[0]
+// CHECK: ins {{v[0-9]+}}.h[7], wzr
 }
 
 // CHECK: test_vset_lane_f16_2
@@ -1365,3 +1363,11 @@
 // CHECK-NEXT: ret
 }
 
+// CHECK-LABEL: test_vset_get_lane_f16:
+int test_vset_get_lane_f16(float16x8_t a) {
+  float16x8_t b;
+  b = vsetq_lane_f16(3.5, a, 5);
+  float16_t c = vgetq_lane_f16(b, 5);
+  return (int)c;
+// CHECK: movz x{{[0-9]+}}, #3
+}
Index: utils/TableGen/NeonEmitter.cpp
===================================================================
--- utils/TableGen/NeonEmitter.cpp
+++ utils/TableGen/NeonEmitter.cpp
@@ -760,7 +760,7 @@
   type = ModType(mod, type, quad, poly, usgn, scal, cnst, pntr);
 
   usgn = usgn | poly | ((ck == ClassI || ck == ClassW) &&
-                         scal && type != 'f' && type != 'd');
+                         scal && type != 'h' &&type != 'f' && type != 'd');
 
   // All pointers are void* pointers.  Change type to 'v' now.
   if (pntr) {
@@ -768,11 +768,6 @@
     poly = false;
     type = 'v';
   }
-  // Treat half-float ('h') types as unsigned short ('s') types.
-  if (type == 'h') {
-    type = 's';
-    usgn = true;
-  }
 
   if (scal) {
     SmallString<128> s;
@@ -796,6 +791,12 @@
     return s.str();
   }
 
+  // Treat half-float ('h') types as unsigned short ('s') types.
+  if (type == 'h') {
+    type = 's';
+    usgn = true;
+  }
+
   // Since the return value must be one type, return a vector type of the
   // appropriate width which we will bitcast.  An exception is made for
   // returning structs of 2, 3, or 4 vectors which are returned in a sret-like
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D2602.1.patch
Type: text/x-patch
Size: 5174 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20140123/259f12db/attachment.bin>