[llvm] 602f81e - [AArch64] Fix zero element TBL indices

Thu May 19 05:54:47 PDT 2022

Author: David Green
Date: 2022-05-19T13:54:35+01:00
New Revision: 602f81ec336330f97e22442b98035c6f007cac6d

URL: https://github.com/llvm/llvm-project/commit/602f81ec336330f97e22442b98035c6f007cac6d
DIFF: https://github.com/llvm/llvm-project/commit/602f81ec336330f97e22442b98035c6f007cac6d.diff

LOG: [AArch64] Fix zero element TBL indices

A TBL instruction will fill out-of-range values with 0s, something used
in D121139 to turn tbl2 with a zero input into tbl1s. This works OK for
v16i8, but for v8i8 the input is still treated as a v16i8, so
out-of-range values (like a lane index of 8) would end up loading values
from the top half of the input register. Clean this up by detecting the
out-of-range indices and replacing them with an index (255) that really
is out of range for a v16i8 table. There is a fix for swapped indices of
64-bit input vectors too, which could be incorrectly adjusted if the
zero vector was the first operand.

Fixes #55545

Differential Revision: https://reviews.llvm.org/D125865

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8e504173e261..503ac7cb359e 100644

--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9939,30 +9939,34 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
     Swap = true;
   }
 
+  // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
+  // out of range values with 0s. We do need to make sure that any out-of-range
+  // values are really out-of-range for a v16i8 vector.
+  bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
+  MVT IndexVT = MVT::v8i8;
+  unsigned IndexLen = 8;
+  if (Op.getValueSizeInBits() == 128) {
+    IndexVT = MVT::v16i8;
+    IndexLen = 16;
+  }
+
   SmallVector<SDValue, 8> TBLMask;
   for (int Val : ShuffleMask) {
     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
       unsigned Offset = Byte + Val * BytesPerElt;
       if (Swap)
-        Offset = Offset < 16 ? Offset + 16 : Offset - 16;
+        Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
+      if (IsUndefOrZero && Offset >= IndexLen)
+        Offset = 255;
       TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
     }
   }
 
-  MVT IndexVT = MVT::v8i8;
-  unsigned IndexLen = 8;
-  if (Op.getValueSizeInBits() == 128) {
-    IndexVT = MVT::v16i8;
-    IndexLen = 16;
-  }
-
   SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
   SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
 
   SDValue Shuffle;
-  // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
-  // out of range values with 0s.
-  if (V2.isUndef() || isZerosVector(V2.getNode())) {
+  if (IsUndefOrZero) {
     if (IndexLen == 8)
       V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
     Shuffle = DAG.getNode(

diff  --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index 6aa07e7846a6..069767da7b49 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -909,9 +909,9 @@ define <8 x i8> @vselect_equivalent_shuffle_v8i8(<8 x i8> %a, <8 x i8> %b) {
 
 ; CHECK-LABEL: .LCPI90_0:
 ; CHECK-NEXT: .byte   0
-; CHECK-NEXT: .byte   8
+; CHECK-NEXT: .byte   255
 ; CHECK-NEXT: .byte   2
-; CHECK-NEXT: .byte   9
+; CHECK-NEXT: .byte   255
 ; CHECK-NEXT: .byte   4
 ; CHECK-NEXT: .byte   5
 ; CHECK-NEXT: .byte   6
@@ -930,14 +930,14 @@ define <8 x i8> @vselect_equivalent_shuffle_v8i8_zero(<8 x i8> %a) {
 }
 
 ; CHECK-LABEL: .LCPI91_0:
-; CHECK-NEXT: .byte   24
-; CHECK-NEXT: .byte   16
-; CHECK-NEXT: .byte   26
-; CHECK-NEXT: .byte   17
-; CHECK-NEXT: .byte   28
-; CHECK-NEXT: .byte   29
-; CHECK-NEXT: .byte   30
-; CHECK-NEXT: .byte   31
+; CHECK-NEXT: .byte   0
+; CHECK-NEXT: .byte   255
+; CHECK-NEXT: .byte   2
+; CHECK-NEXT: .byte   255
+; CHECK-NEXT: .byte   4
+; CHECK-NEXT: .byte   5
+; CHECK-NEXT: .byte   6
+; CHECK-NEXT: .byte   7
 define <8 x i8> @vselect_equivalent_shuffle_v8i8_zeroswap(<8 x i8> %a) {
 ; CHECK-LABEL: vselect_equivalent_shuffle_v8i8_zeroswap:
 ; CHECK:       // %bb.0:
@@ -984,12 +984,12 @@ define <8 x i16> @vselect_equivalent_shuffle_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: .LCPI93_0:
 ; CHECK-NEXT: .byte   0
 ; CHECK-NEXT: .byte   1
-; CHECK-NEXT: .byte   16
-; CHECK-NEXT: .byte   17
+; CHECK-NEXT: .byte   255
+; CHECK-NEXT: .byte   255
 ; CHECK-NEXT: .byte   4
 ; CHECK-NEXT: .byte   5
-; CHECK-NEXT: .byte   18
-; CHECK-NEXT: .byte   19
+; CHECK-NEXT: .byte   255
+; CHECK-NEXT: .byte   255
 ; CHECK-NEXT: .byte   8
 ; CHECK-NEXT: .byte   9
 ; CHECK-NEXT: .byte   10
@@ -1011,12 +1011,12 @@ define <8 x i16> @vselect_equivalent_shuffle_v8i16_zero(<8 x i16> %a) {
 
 ; CHECK: .byte   0
 ; CHECK: .byte   1
-; CHECK: .byte   16
-; CHECK: .byte   17
+; CHECK: .byte   255
+; CHECK: .byte   255
 ; CHECK: .byte   4
 ; CHECK: .byte   5
-; CHECK: .byte   18
-; CHECK: .byte   19
+; CHECK: .byte   255
+; CHECK: .byte   255
 ; CHECK: .byte   8
 ; CHECK: .byte   9
 ; CHECK: .byte   10