[llvm] 7b021f2 - [PowerPC] Optimize VPERM and fix code order for swapping vector operands on LE

Maryam Moghadas via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 13 13:00:54 PDT 2023


Author: Maryam Moghadas
Date: 2023-09-13T15:00:49-05:00
New Revision: 7b021f2e64e1f5f7bdb4a7d72be12ad46e26cf79

URL: https://github.com/llvm/llvm-project/commit/7b021f2e64e1f5f7bdb4a7d72be12ad46e26cf79
DIFF: https://github.com/llvm/llvm-project/commit/7b021f2e64e1f5f7bdb4a7d72be12ad46e26cf79.diff

LOG: [PowerPC] Optimize VPERM and fix code order for swapping vector operands on LE

This patch reverts commit 7614ba0a5db8 so that VPERM is optimized when one
of its vector operands is an XXSWAPD, similar to XXPERM. It also reorders
the operand swap performed on little-endian targets, so that the vector
operands are swapped after the mask operand has been adjusted. This
ensures that the vector operands are swapped at the correct point in the
code, resulting in a valid constant pool entry for the mask operand.

Reviewed By: stefanp

Differential Revision: https://reviews.llvm.org/D149083

Added: 
    

Modified: 
    llvm/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/test/CodeGen/PowerPC/build-vector-tests.ll
    llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll
    llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
    llvm/test/CodeGen/PowerPC/vperm-swap.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 0ebdd77f8a311b3..95f2243178c8a10 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -10314,11 +10314,6 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
   bool isLittleEndian = Subtarget.isLittleEndian();
   bool isPPC64 = Subtarget.isPPC64();
 
-  // Only need to place items backwards in LE,
-  // the mask will be properly calculated.
-  if (isLittleEndian)
-    std::swap(V1, V2);
-
   if (Subtarget.hasVSX() && Subtarget.hasP9Vector() &&
       (V1->hasOneUse() || V2->hasOneUse())) {
     LLVM_DEBUG(dbgs() << "At least one of two input vectors are dead - using "
@@ -10328,7 +10323,8 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
     // The second input to XXPERM is also an output so if the second input has
     // multiple uses then copying is necessary, as a result we want the
     // single-use operand to be used as the second input to prevent copying.
-    if (!V2->hasOneUse() && V1->hasOneUse()) {
+    if ((!isLittleEndian && !V2->hasOneUse() && V1->hasOneUse()) ||
+        (isLittleEndian && !V1->hasOneUse() && V2->hasOneUse())) {
       std::swap(V1, V2);
       NeedSwap = !NeedSwap;
     }
@@ -10367,27 +10363,24 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
     unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];
 
-    if (Opcode == PPCISD::XXPERM) {
-      if (V1HasXXSWAPD) {
-        if (SrcElt < 8)
-          SrcElt += 8;
-        else if (SrcElt < 16)
-          SrcElt -= 8;
-      }
-      if (V2HasXXSWAPD) {
-        if (SrcElt > 23)
-          SrcElt -= 8;
-        else if (SrcElt > 15)
-          SrcElt += 8;
-      }
-      if (NeedSwap) {
-        if (SrcElt < 16)
-          SrcElt += 16;
-        else
-          SrcElt -= 16;
-      }
+    if (V1HasXXSWAPD) {
+      if (SrcElt < 8)
+        SrcElt += 8;
+      else if (SrcElt < 16)
+        SrcElt -= 8;
+    }
+    if (V2HasXXSWAPD) {
+      if (SrcElt > 23)
+        SrcElt -= 8;
+      else if (SrcElt > 15)
+        SrcElt += 8;
+    }
+    if (NeedSwap) {
+      if (SrcElt < 16)
+        SrcElt += 16;
+      else
+        SrcElt -= 16;
     }
-
     for (unsigned j = 0; j != BytesPerElement; ++j)
       if (isLittleEndian)
         ResultMask.push_back(
@@ -10397,18 +10390,19 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
             DAG.getConstant(SrcElt * BytesPerElement + j, dl, MVT::i32));
   }
 
-  if (Opcode == PPCISD::XXPERM && (V1HasXXSWAPD || V2HasXXSWAPD)) {
-    if (V1HasXXSWAPD) {
-      dl = SDLoc(V1->getOperand(0));
-      V1 = V1->getOperand(0)->getOperand(1);
-    }
-    if (V2HasXXSWAPD) {
-      dl = SDLoc(V2->getOperand(0));
-      V2 = V2->getOperand(0)->getOperand(1);
-    }
-    if (isPPC64 && ValType != MVT::v2f64)
+  if (V1HasXXSWAPD) {
+    dl = SDLoc(V1->getOperand(0));
+    V1 = V1->getOperand(0)->getOperand(1);
+  }
+  if (V2HasXXSWAPD) {
+    dl = SDLoc(V2->getOperand(0));
+    V2 = V2->getOperand(0)->getOperand(1);
+  }
+
+  if (isPPC64 && (V1HasXXSWAPD || V2HasXXSWAPD)) {
+    if (ValType != MVT::v2f64)
       V1 = DAG.getBitcast(MVT::v2f64, V1);
-    if (isPPC64 && V2.getValueType() != MVT::v2f64)
+    if (V2.getValueType() != MVT::v2f64)
       V2 = DAG.getBitcast(MVT::v2f64, V2);
   }
 
@@ -10429,6 +10423,11 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
   if (Opcode == PPCISD::XXPERM)
     VPermMask = DAG.getBitcast(MVT::v4i32, VPermMask);
 
+  // Only need to place items backwards in LE,
+  // the mask was properly calculated.
+  if (isLittleEndian)
+    std::swap(V1, V2);
+
   SDValue VPERMNode =
       DAG.getNode(Opcode, dl, V1.getValueType(), V1, V2, VPermMask);
 

diff  --git a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
index 6410738af6c6e0e..f729018dd41061e 100644
--- a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
+++ b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
@@ -1058,16 +1058,15 @@ define <4 x i32> @fromDiffMemVarDi(ptr nocapture readonly %arr, i32 signext %ele
 ;
 ; P8LE-LABEL: fromDiffMemVarDi:
 ; P8LE:       # %bb.0: # %entry
+; P8LE-NEXT:    addis r5, r2, .LCPI9_0@toc@ha
 ; P8LE-NEXT:    sldi r4, r4, 2
+; P8LE-NEXT:    addi r5, r5, .LCPI9_0@toc@l
 ; P8LE-NEXT:    add r3, r3, r4
+; P8LE-NEXT:    lxvd2x vs0, 0, r5
 ; P8LE-NEXT:    addi r3, r3, -12
-; P8LE-NEXT:    lxvd2x vs0, 0, r3
-; P8LE-NEXT:    addis r3, r2, .LCPI9_0@toc@ha
-; P8LE-NEXT:    addi r3, r3, .LCPI9_0@toc@l
+; P8LE-NEXT:    lxvd2x v3, 0, r3
 ; P8LE-NEXT:    xxswapd v2, vs0
-; P8LE-NEXT:    lxvd2x vs0, 0, r3
-; P8LE-NEXT:    xxswapd v3, vs0
-; P8LE-NEXT:    vperm v2, v2, v2, v3
+; P8LE-NEXT:    vperm v2, v3, v3, v2
 ; P8LE-NEXT:    blr
 entry:
   %idxprom = sext i32 %elem to i64
@@ -1478,13 +1477,12 @@ define <4 x i32> @fromDiffMemConsDConvftoi(ptr nocapture readonly %ptr) {
 ;
 ; P8LE-LABEL: fromDiffMemConsDConvftoi:
 ; P8LE:       # %bb.0: # %entry
-; P8LE-NEXT:    lxvd2x vs0, 0, r3
-; P8LE-NEXT:    addis r3, r2, .LCPI18_0@toc@ha
-; P8LE-NEXT:    addi r3, r3, .LCPI18_0@toc@l
+; P8LE-NEXT:    addis r4, r2, .LCPI18_0@toc@ha
+; P8LE-NEXT:    lxvd2x v3, 0, r3
+; P8LE-NEXT:    addi r4, r4, .LCPI18_0@toc@l
+; P8LE-NEXT:    lxvd2x vs0, 0, r4
 ; P8LE-NEXT:    xxswapd v2, vs0
-; P8LE-NEXT:    lxvd2x vs0, 0, r3
-; P8LE-NEXT:    xxswapd v3, vs0
-; P8LE-NEXT:    vperm v2, v2, v2, v3
+; P8LE-NEXT:    vperm v2, v3, v3, v2
 ; P8LE-NEXT:    xvcvspsxws v2, v2
 ; P8LE-NEXT:    blr
 entry:
@@ -2580,16 +2578,15 @@ define <4 x i32> @fromDiffMemVarDui(ptr nocapture readonly %arr, i32 signext %el
 ;
 ; P8LE-LABEL: fromDiffMemVarDui:
 ; P8LE:       # %bb.0: # %entry
+; P8LE-NEXT:    addis r5, r2, .LCPI41_0@toc@ha
 ; P8LE-NEXT:    sldi r4, r4, 2
+; P8LE-NEXT:    addi r5, r5, .LCPI41_0@toc@l
 ; P8LE-NEXT:    add r3, r3, r4
+; P8LE-NEXT:    lxvd2x vs0, 0, r5
 ; P8LE-NEXT:    addi r3, r3, -12
-; P8LE-NEXT:    lxvd2x vs0, 0, r3
-; P8LE-NEXT:    addis r3, r2, .LCPI41_0@toc@ha
-; P8LE-NEXT:    addi r3, r3, .LCPI41_0@toc@l
+; P8LE-NEXT:    lxvd2x v3, 0, r3
 ; P8LE-NEXT:    xxswapd v2, vs0
-; P8LE-NEXT:    lxvd2x vs0, 0, r3
-; P8LE-NEXT:    xxswapd v3, vs0
-; P8LE-NEXT:    vperm v2, v2, v2, v3
+; P8LE-NEXT:    vperm v2, v3, v3, v2
 ; P8LE-NEXT:    blr
 entry:
   %idxprom = sext i32 %elem to i64
@@ -3000,13 +2997,12 @@ define <4 x i32> @fromDiffMemConsDConvftoui(ptr nocapture readonly %ptr) {
 ;
 ; P8LE-LABEL: fromDiffMemConsDConvftoui:
 ; P8LE:       # %bb.0: # %entry
-; P8LE-NEXT:    lxvd2x vs0, 0, r3
-; P8LE-NEXT:    addis r3, r2, .LCPI50_0@toc@ha
-; P8LE-NEXT:    addi r3, r3, .LCPI50_0@toc@l
+; P8LE-NEXT:    addis r4, r2, .LCPI50_0@toc@ha
+; P8LE-NEXT:    lxvd2x v3, 0, r3
+; P8LE-NEXT:    addi r4, r4, .LCPI50_0@toc@l
+; P8LE-NEXT:    lxvd2x vs0, 0, r4
 ; P8LE-NEXT:    xxswapd v2, vs0
-; P8LE-NEXT:    lxvd2x vs0, 0, r3
-; P8LE-NEXT:    xxswapd v3, vs0
-; P8LE-NEXT:    vperm v2, v2, v2, v3
+; P8LE-NEXT:    vperm v2, v3, v3, v2
 ; P8LE-NEXT:    xvcvspuxws v2, v2
 ; P8LE-NEXT:    blr
 entry:

diff  --git a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll
index 25697311df37381..11cc8abd2c7fa3d 100644
--- a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll
+++ b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll
@@ -183,14 +183,13 @@ entry:
 define <16 x i8> @test_none_v16i8(i8 %arg, ptr nocapture noundef readonly %b) {
 ; CHECK-LE-P8-LABEL: test_none_v16i8:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    addis r4, r2, .LCPI2_0@toc@ha
+; CHECK-LE-P8-NEXT:    addis r5, r2, .LCPI2_0@toc@ha
+; CHECK-LE-P8-NEXT:    lxvd2x v3, 0, r4
 ; CHECK-LE-P8-NEXT:    mtvsrd v4, r3
-; CHECK-LE-P8-NEXT:    addi r4, r4, .LCPI2_0@toc@l
+; CHECK-LE-P8-NEXT:    addi r5, r5, .LCPI2_0@toc@l
+; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r5
 ; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    xxswapd v3, vs0
-; CHECK-LE-P8-NEXT:    vperm v2, v4, v2, v3
+; CHECK-LE-P8-NEXT:    vperm v2, v4, v3, v2
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_none_v16i8:
@@ -431,14 +430,13 @@ entry:
 define <16 x i8> @test_none_v8i16(i16 %arg, ptr nocapture noundef readonly %b) {
 ; CHECK-LE-P8-LABEL: test_none_v8i16:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    addis r4, r2, .LCPI5_0@toc@ha
+; CHECK-LE-P8-NEXT:    addis r5, r2, .LCPI5_0@toc@ha
+; CHECK-LE-P8-NEXT:    lxvd2x v3, 0, r4
 ; CHECK-LE-P8-NEXT:    mtvsrd v4, r3
-; CHECK-LE-P8-NEXT:    addi r4, r4, .LCPI5_0@toc@l
+; CHECK-LE-P8-NEXT:    addi r5, r5, .LCPI5_0@toc@l
+; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r5
 ; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    xxswapd v3, vs0
-; CHECK-LE-P8-NEXT:    vperm v2, v4, v2, v3
+; CHECK-LE-P8-NEXT:    vperm v2, v4, v3, v2
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_none_v8i16:

diff  --git a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
index 37820afeae082fa..201bc5be545068a 100644
--- a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
+++ b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
@@ -469,19 +469,18 @@ entry:
 define void @test_none_v2i64(ptr nocapture readonly %ptr1, ptr nocapture readonly %ptr2) {
 ; CHECK-LE-P8-LABEL: test_none_v2i64:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    addis r4, r2, .LCPI4_0@toc@ha
-; CHECK-LE-P8-NEXT:    lxsdx v4, 0, r3
+; CHECK-LE-P8-NEXT:    addis r5, r2, .LCPI4_0@toc@ha
+; CHECK-LE-P8-NEXT:    lxsdx v3, 0, r3
 ; CHECK-LE-P8-NEXT:    addis r3, r2, .LCPI4_1@toc@ha
-; CHECK-LE-P8-NEXT:    addi r4, r4, .LCPI4_0@toc@l
+; CHECK-LE-P8-NEXT:    lxvd2x v4, 0, r4
+; CHECK-LE-P8-NEXT:    addi r5, r5, .LCPI4_0@toc@l
 ; CHECK-LE-P8-NEXT:    addi r3, r3, .LCPI4_1@toc@l
-; CHECK-LE-P8-NEXT:    lxvd2x vs1, 0, r4
+; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r5
 ; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r3
-; CHECK-LE-P8-NEXT:    xxswapd v3, vs1
-; CHECK-LE-P8-NEXT:    vperm v2, v4, v2, v3
-; CHECK-LE-P8-NEXT:    xxswapd v3, vs0
+; CHECK-LE-P8-NEXT:    vperm v2, v3, v4, v2
 ; CHECK-LE-P8-NEXT:    xxlxor v4, v4, v4
+; CHECK-LE-P8-NEXT:    xxswapd v3, vs0
 ; CHECK-LE-P8-NEXT:    vperm v2, v4, v2, v3
 ; CHECK-LE-P8-NEXT:    xxswapd vs0, v2
 ; CHECK-LE-P8-NEXT:    stxvd2x vs0, 0, r3

diff  --git a/llvm/test/CodeGen/PowerPC/vperm-swap.ll b/llvm/test/CodeGen/PowerPC/vperm-swap.ll
index 0a3b5ae613db8a5..1f979783e5f0b26 100644
--- a/llvm/test/CodeGen/PowerPC/vperm-swap.ll
+++ b/llvm/test/CodeGen/PowerPC/vperm-swap.ll
@@ -4,32 +4,31 @@
 
 define <16 x i8> @test_none_v16i8(i8 %arg, ptr nocapture noundef readonly %b) {
 ; CHECK-LE-P8: .LCPI0_0:
-; CHECK-LE-P8-NEXT: .byte   31                              # 0x1f
-; CHECK-LE-P8-NEXT: .byte   30                              # 0x1e
+; CHECK-LE-P8-NEXT: .byte   23                              # 0x17
+; CHECK-LE-P8-NEXT: .byte   22                              # 0x16
 ; CHECK-LE-P8-NEXT: .byte   7                               # 0x7
-; CHECK-LE-P8-NEXT: .byte   31                              # 0x1f
-; CHECK-LE-P8-NEXT: .byte   31                              # 0x1f
-; CHECK-LE-P8-NEXT: .byte   31                              # 0x1f
-; CHECK-LE-P8-NEXT: .byte   31                              # 0x1f
-; CHECK-LE-P8-NEXT: .byte   31                              # 0x1f
-; CHECK-LE-P8-NEXT: .byte   31                              # 0x1f
-; CHECK-LE-P8-NEXT: .byte   31                              # 0x1f
-; CHECK-LE-P8-NEXT: .byte   31                              # 0x1f
-; CHECK-LE-P8-NEXT: .byte   31                              # 0x1f
-; CHECK-LE-P8-NEXT: .byte   31                              # 0x1f
-; CHECK-LE-P8-NEXT: .byte   31                              # 0x1f
-; CHECK-LE-P8-NEXT: .byte   31                              # 0x1f
-; CHECK-LE-P8-NEXT: .byte   31                              # 0x1f
+; CHECK-LE-P8-NEXT: .byte   23                              # 0x17
+; CHECK-LE-P8-NEXT: .byte   23                              # 0x17
+; CHECK-LE-P8-NEXT: .byte   23                              # 0x17
+; CHECK-LE-P8-NEXT: .byte   23                              # 0x17
+; CHECK-LE-P8-NEXT: .byte   23                              # 0x17
+; CHECK-LE-P8-NEXT: .byte   23                              # 0x17
+; CHECK-LE-P8-NEXT: .byte   23                              # 0x17
+; CHECK-LE-P8-NEXT: .byte   23                              # 0x17
+; CHECK-LE-P8-NEXT: .byte   23                              # 0x17
+; CHECK-LE-P8-NEXT: .byte   23                              # 0x17
+; CHECK-LE-P8-NEXT: .byte   23                              # 0x17
+; CHECK-LE-P8-NEXT: .byte   23                              # 0x17
+; CHECK-LE-P8-NEXT: .byte   23                              # 0x17
 ; CHECK-LE-P8-LABEL: test_none_v16i8:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    addis r4, r2, .LCPI0_0@toc@ha
+; CHECK-LE-P8-NEXT:    addis r5, r2, .LCPI0_0@toc@ha
+; CHECK-LE-P8-NEXT:    lxvd2x v3, 0, r4
 ; CHECK-LE-P8-NEXT:    mtvsrd v4, r3
-; CHECK-LE-P8-NEXT:    addi r4, r4, .LCPI0_0@toc@l
+; CHECK-LE-P8-NEXT:    addi r5, r5, .LCPI0_0@toc@l
+; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r5
 ; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    xxswapd v3, vs0
-; CHECK-LE-P8-NEXT:    vperm v2, v4, v2, v3
+; CHECK-LE-P8-NEXT:    vperm v2, v4, v3, v2
 ; CHECK-LE-P8-NEXT:    blr
 entry:
   %lhs = load <16 x i8>, ptr %b, align 4


        


More information about the llvm-commits mailing list