[llvm] 1552b91 - [X86] X86FixupVectorConstantsPass - attempt to match VEX logic ops back to EVEX if we can create a broadcast fold

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 21 10:01:53 PST 2023


Author: Simon Pilgrim
Date: 2023-11-21T18:01:29Z
New Revision: 1552b91162bbb410971e2d4e5ec7afd1c7cc932f

URL: https://github.com/llvm/llvm-project/commit/1552b91162bbb410971e2d4e5ec7afd1c7cc932f
DIFF: https://github.com/llvm/llvm-project/commit/1552b91162bbb410971e2d4e5ec7afd1c7cc932f.diff

LOG: [X86] X86FixupVectorConstantsPass - attempt to match VEX logic ops back to EVEX if we can create a broadcast fold

On non-DQI AVX512 targets, X86InstrInfo::setExecutionDomainCustom will convert EVEX int-domain instructions to VEX fp-domain instructions. But if we have the chance to use a broadcast fold, we're better off using an EVEX instruction, so handle the reverse conversion as well.
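
The updated tests below show the effect on AVX512VL (non-DQI) targets: a full-width constant-pool operand folded into a VEX fp-domain logic op is rewritten back to an EVEX int-domain form with an embedded broadcast, so the constant pool only needs to hold a single element. A minimal before/after sketch in AT&T syntax (the constant-pool label is illustrative):

    # Before: VEX fp-domain logic op folding a full 16-byte constant load.
    vandps .LCPI0_0(%rip), %xmm0, %xmm0
    # After: EVEX int-domain logic op broadcasting a single 4-byte element.
    vpandd .LCPI0_0(%rip){1to4}, %xmm0, %xmm0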

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86FixupVectorConstants.cpp
    llvm/test/CodeGen/X86/combine-abs.ll
    llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
    llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index d4d5cd8c3e16a4f..326e09a1254a0b8 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -233,6 +233,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
   bool HasAVX2 = ST->hasAVX2();
   bool HasDQI = ST->hasDQI();
   bool HasBWI = ST->hasBWI();
+  bool HasVLX = ST->hasVLX();
 
   auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128,
                                 unsigned OpBcst64, unsigned OpBcst32,
@@ -352,20 +353,22 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
         1);
   }
 
-  // Attempt to find a AVX512 mapping from a full width memory-fold instruction
-  // to a broadcast-fold instruction variant.
-  if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX) {
+  auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) {
     unsigned OpBcst32 = 0, OpBcst64 = 0;
     unsigned OpNoBcst32 = 0, OpNoBcst64 = 0;
-    if (const X86MemoryFoldTableEntry *Mem2Bcst =
-            llvm::lookupBroadcastFoldTable(Opc, 32)) {
-      OpBcst32 = Mem2Bcst->DstOp;
-      OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK;
+    if (OpSrc32) {
+      if (const X86MemoryFoldTableEntry *Mem2Bcst =
+              llvm::lookupBroadcastFoldTable(OpSrc32, 32)) {
+        OpBcst32 = Mem2Bcst->DstOp;
+        OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK;
+      }
     }
-    if (const X86MemoryFoldTableEntry *Mem2Bcst =
-            llvm::lookupBroadcastFoldTable(Opc, 64)) {
-      OpBcst64 = Mem2Bcst->DstOp;
-      OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK;
+    if (OpSrc64) {
+      if (const X86MemoryFoldTableEntry *Mem2Bcst =
+              llvm::lookupBroadcastFoldTable(OpSrc64, 64)) {
+        OpBcst64 = Mem2Bcst->DstOp;
+        OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK;
+      }
     }
     assert(((OpBcst32 == 0) || (OpBcst64 == 0) || (OpNoBcst32 == OpNoBcst64)) &&
            "OperandNo mismatch");
@@ -374,6 +377,70 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
       unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32;
       return ConvertToBroadcast(0, 0, OpBcst64, OpBcst32, 0, 0, OpNo);
     }
+    return false;
+  };
+
+  // Attempt to find an AVX512 mapping from a full-width memory-fold instruction
+  // to a broadcast-fold instruction variant.
+  if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX)
+    return ConvertToBroadcastAVX512(Opc, Opc);
+
+  // Reverse the X86InstrInfo::setExecutionDomainCustom EVEX->VEX logic
+  // conversion to see if we can convert to a broadcasted (integer) logic op.
+  if (HasVLX && !HasDQI) {
+    unsigned OpSrc32 = 0, OpSrc64 = 0;
+    switch (Opc) {
+    case X86::VANDPDrm:
+    case X86::VANDPSrm:
+    case X86::VPANDrm:
+      OpSrc32 = X86::VPANDDZ128rm;
+      OpSrc64 = X86::VPANDQZ128rm;
+      break;
+    case X86::VANDPDYrm:
+    case X86::VANDPSYrm:
+    case X86::VPANDYrm:
+      OpSrc32 = X86::VPANDDZ256rm;
+      OpSrc64 = X86::VPANDQZ256rm;
+      break;
+    case X86::VANDNPDrm:
+    case X86::VANDNPSrm:
+    case X86::VPANDNrm:
+      OpSrc32 = X86::VPANDNDZ128rm;
+      OpSrc64 = X86::VPANDNQZ128rm;
+      break;
+    case X86::VANDNPDYrm:
+    case X86::VANDNPSYrm:
+    case X86::VPANDNYrm:
+      OpSrc32 = X86::VPANDNDZ256rm;
+      OpSrc64 = X86::VPANDNQZ256rm;
+      break;
+    case X86::VORPDrm:
+    case X86::VORPSrm:
+    case X86::VPORrm:
+      OpSrc32 = X86::VPORDZ128rm;
+      OpSrc64 = X86::VPORQZ128rm;
+      break;
+    case X86::VORPDYrm:
+    case X86::VORPSYrm:
+    case X86::VPORYrm:
+      OpSrc32 = X86::VPORDZ256rm;
+      OpSrc64 = X86::VPORQZ256rm;
+      break;
+    case X86::VXORPDrm:
+    case X86::VXORPSrm:
+    case X86::VPXORrm:
+      OpSrc32 = X86::VPXORDZ128rm;
+      OpSrc64 = X86::VPXORQZ128rm;
+      break;
+    case X86::VXORPDYrm:
+    case X86::VXORPSYrm:
+    case X86::VPXORYrm:
+      OpSrc32 = X86::VPXORDZ256rm;
+      OpSrc64 = X86::VPXORQZ256rm;
+      break;
+    }
+    if (OpSrc32 || OpSrc64)
+      return ConvertToBroadcastAVX512(OpSrc32, OpSrc64);
   }
 
   return false;

diff --git a/llvm/test/CodeGen/X86/combine-abs.ll b/llvm/test/CodeGen/X86/combine-abs.ll
index 410218b33eb9c9e..202c88109eaeb2e 100644
--- a/llvm/test/CodeGen/X86/combine-abs.ll
+++ b/llvm/test/CodeGen/X86/combine-abs.ll
@@ -164,10 +164,20 @@ define <16 x i8> @combine_v16i8_abs_constant(<16 x i8> %a) {
 ; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_v16i8_abs_constant:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_v16i8_abs_constant:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: combine_v16i8_abs_constant:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: combine_v16i8_abs_constant:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
   %1 = insertelement <16 x i8> undef, i8 15, i32 0
   %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
   %3 = and <16 x i8> %a, %2

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 57a3c95f31717f5..ec75631a9b5ed29 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -711,10 +711,15 @@ define <16 x i8> @shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz(
 ; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
   ret <16 x i8> %shuffle
 }

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 1e7745a4b8836be..f5c5ba663175041 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -2362,10 +2362,25 @@ define <32 x i8> @load_fold_pblendvb_commute(ptr %px, <32 x i8> %y) {
 }
 
 define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31(<32 x i8> %a) {
-; ALL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; ALL-NEXT:    retq
+; AVX1-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; XOP-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; XOP-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
   ret <32 x i8> %shuffle
 }

