[llvm] [ARM] Fix VBICimm and VORRimm generation under Big endian. (PR #107813)

Sun Sep 8 23:41:56 PDT 2024

https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/107813

This is a smaller follow on to #105519 that fixes VBICimm and VORRimm too. The logic behind lowering vector immediates under big endian Neon/MVE is to treat them in natural lane ordering (same as little endian), and VECTOR_REG_CAST them to the correct type (as opposed to creating the constants in big endian form and bitcasting them). This makes sure that is done when creating VORRIMM and VBICIMM.

>From 2f5604b7b3afc3a5693e297168a2d93c9cc529d9 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 9 Sep 2024 07:40:28 +0100
Subject: [PATCH] [ARM] Fix VBICimm and VORRimm generation under Big endian.

This is a smaller follow on to #105519 that fixes VBICimm and VORRimm too. The
logic behind lowering vector immediates under big endian Neon/MVE is to treat
them in natural lane ordering (same as little endian), and VECTOR_REG_CAST them
to the correct type (as opposed to creating the constants in big endian form
and bitcasting them). This makes sure that is done when creating VORRIMM and
VBICIMM.
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp  |  8 ++++----
 llvm/test/CodeGen/ARM/big-endian-vmov.ll | 10 ++++------
 llvm/test/CodeGen/Thumb2/mve-vmovimm.ll  | 10 ++++------
 3 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 8c82161bd15c65..67653a17512d2c 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -14443,9 +14443,9 @@ static SDValue PerformANDCombine(SDNode *N,
                                       DAG, dl, VbicVT, VT, OtherModImm);
       if (Val.getNode()) {
         SDValue Input =
-          DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
+            DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VbicVT, N->getOperand(0));
         SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
-        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
+        return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vbic);
       }
     }
   }
@@ -14739,9 +14739,9 @@ static SDValue PerformORCombine(SDNode *N,
                             SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
       if (Val.getNode()) {
         SDValue Input =
-          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
+            DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VorrVT, N->getOperand(0));
         SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
-        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
+        return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vorr);
       }
     }
   }
diff --git a/llvm/test/CodeGen/ARM/big-endian-vmov.ll b/llvm/test/CodeGen/ARM/big-endian-vmov.ll
index 327d4334ad83a1..e3647cf6234c82 100644
--- a/llvm/test/CodeGen/ARM/big-endian-vmov.ll
+++ b/llvm/test/CodeGen/ARM/big-endian-vmov.ll
@@ -219,7 +219,6 @@ define arm_aapcs_vfpcc <8 x i16> @vmvn_v16i8_m1() {
   ret <8 x i16> <i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534>
 }
 
-; FIXME: This is incorrect for BE
 define arm_aapcs_vfpcc <8 x i16> @and_v8i16_m1(<8 x i16> %a) {
 ; CHECK-LE-LABEL: and_v8i16_m1:
 ; CHECK-LE:       @ %bb.0:
@@ -228,15 +227,14 @@ define arm_aapcs_vfpcc <8 x i16> @and_v8i16_m1(<8 x i16> %a) {
 ;
 ; CHECK-BE-LABEL: and_v8i16_m1:
 ; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vrev64.32 q8, q0
+; CHECK-BE-NEXT:    vrev64.16 q8, q0
 ; CHECK-BE-NEXT:    vbic.i32 q8, #0x10000
-; CHECK-BE-NEXT:    vrev64.32 q0, q8
+; CHECK-BE-NEXT:    vrev64.16 q0, q8
 ; CHECK-BE-NEXT:    bx lr
   %b = and <8 x i16> %a, <i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534>
   ret <8 x i16> %b
 }
 
-; FIXME: This is incorrect for BE
 define arm_aapcs_vfpcc <8 x i16> @or_v8i16_1(<8 x i16> %a) {
 ; CHECK-LE-LABEL: or_v8i16_1:
 ; CHECK-LE:       @ %bb.0:
@@ -245,9 +243,9 @@ define arm_aapcs_vfpcc <8 x i16> @or_v8i16_1(<8 x i16> %a) {
 ;
 ; CHECK-BE-LABEL: or_v8i16_1:
 ; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vrev64.32 q8, q0
+; CHECK-BE-NEXT:    vrev64.16 q8, q0
 ; CHECK-BE-NEXT:    vorr.i32 q8, #0x10000
-; CHECK-BE-NEXT:    vrev64.32 q0, q8
+; CHECK-BE-NEXT:    vrev64.16 q0, q8
 ; CHECK-BE-NEXT:    bx lr
   %b = or <8 x i16> %a, <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>
   ret <8 x i16> %b
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll b/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll
index 9cf92663e3b052..1970ff35f183fa 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll
@@ -1331,7 +1331,6 @@ entry:
   ret <2 x i64> %s
 }
 
-; FIXME: This is incorrect for BE
 define arm_aapcs_vfpcc <8 x i16> @and_v8i16_m1(<8 x i16> %a) {
 ; CHECKLE-LABEL: and_v8i16_m1:
 ; CHECKLE:       @ %bb.0:
@@ -1340,15 +1339,14 @@ define arm_aapcs_vfpcc <8 x i16> @and_v8i16_m1(<8 x i16> %a) {
 ;
 ; CHECKBE-LABEL: and_v8i16_m1:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vrev64.32 q1, q0
+; CHECKBE-NEXT:    vrev64.16 q1, q0
 ; CHECKBE-NEXT:    vbic.i32 q1, #0x10000
-; CHECKBE-NEXT:    vrev64.32 q0, q1
+; CHECKBE-NEXT:    vrev64.16 q0, q1
 ; CHECKBE-NEXT:    bx lr
   %b = and <8 x i16> %a, <i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534>
   ret <8 x i16> %b
 }
 
-; FIXME: This is incorrect for BE
 define arm_aapcs_vfpcc <8 x i16> @or_v8i16_1(<8 x i16> %a) {
 ; CHECKLE-LABEL: or_v8i16_1:
 ; CHECKLE:       @ %bb.0:
@@ -1357,9 +1355,9 @@ define arm_aapcs_vfpcc <8 x i16> @or_v8i16_1(<8 x i16> %a) {
 ;
 ; CHECKBE-LABEL: or_v8i16_1:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vrev64.32 q1, q0
+; CHECKBE-NEXT:    vrev64.16 q1, q0
 ; CHECKBE-NEXT:    vorr.i32 q1, #0x10000
-; CHECKBE-NEXT:    vrev64.32 q0, q1
+; CHECKBE-NEXT:    vrev64.16 q0, q1
 ; CHECKBE-NEXT:    bx lr
   %b = or <8 x i16> %a, <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>
   ret <8 x i16> %b