[PATCH] D41863: [AArch64] Fix incorrect LD1 of 16-bit FP vectors in big endian

Tue Jan 9 07:25:55 PST 2018

pbarrio created this revision.
pbarrio added reviewers: craig.topper, jmolloy, olista01.
Herald added subscribers: kristof.beyls, javed.absar, rengolin, aemerson.

Loading a vector of four half-precision FP values (v4f16) is sometimes
lowered to an LD1 of two single-precision-sized elements (.2s) followed
by a reversal (REV64). On big-endian targets this results in an
incorrect byte swap due to the conversion from little endian to big
endian.

Rather than correcting the byte swap after the fact, it is easier to
generate the correct LD1 of four half-precision FP values (.4h) in the
first place, thus avoiding the subsequent reversal.


https://reviews.llvm.org/D41863

Files:
  lib/Target/AArch64/AArch64ISelLowering.cpp
  lib/Target/AArch64/AArch64InstrInfo.td
  test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll


Index: test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
===================================================================

--- test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
+++ test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
@@ -1099,3 +1099,17 @@
     store <16 x i8> %4, <16 x i8>* %q
     ret void
 }
+
+; CHECK-LABEL: test_v4f16_struct:
+%struct.struct1 = type { half, half, half, half }
+define %struct.struct1 @test_v4f16_struct(%struct.struct1* %ret) {
+entry:
+; CHECK: ld1 { v{{[0-9]+}}.4h }
+; CHECK-NOT: ld1 { {{v[0-9]+}}.2s }
+; CHECK-NOT: rev64
+  %0 = bitcast %struct.struct1* %ret to <4 x half>*
+  %1 = load <4 x half>, <4 x half>* %0, align 2
+  %2 = extractelement <4 x half> %1, i32 0
+  %.fca.0.insert = insertvalue %struct.struct1 undef, half %2, 0
+  ret %struct.struct1 %.fca.0.insert
+}
Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -5849,7 +5849,7 @@
 def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))),
                              (v4f16 (REV64v4i16 FPR64:$src))>;
 def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))),
-                             (v4f16 (REV64v4i16 FPR64:$src))>;
+                             (v4f16 FPR64:$src)>;
 def : Pat<(v4f16 (bitconvert (v8i8  FPR64:$src))),
                              (v4f16 (REV16v8i8 FPR64:$src))>;
 def : Pat<(v4f16 (bitconvert (f64   FPR64:$src))),
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -721,12 +721,18 @@
 }
 
 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
-  if (VT == MVT::v2f32 || VT == MVT::v4f16) {
+  if (VT == MVT::v2f32) {
     setOperationAction(ISD::LOAD, VT, Promote);
     AddPromotedToType(ISD::LOAD, VT, MVT::v2i32);
 
     setOperationAction(ISD::STORE, VT, Promote);
     AddPromotedToType(ISD::STORE, VT, MVT::v2i32);
+  } else if (VT == MVT::v4f16) {
+    setOperationAction(ISD::LOAD, VT, Promote);
+    AddPromotedToType(ISD::LOAD, VT, MVT::v4i16);
+
+    setOperationAction(ISD::STORE, VT, Promote);
+    AddPromotedToType(ISD::STORE, VT, MVT::v4i16);
   } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
     setOperationAction(ISD::LOAD, VT, Promote);
     AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D41863.129083.patch
Type: text/x-patch
Size: 2506 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20180109/cddab71a/attachment.bin>