[llvm] bef25ae - [X86] X86FixupVectorConstants - use explicit register bitwidth for the loaded vector instead of using constant pool bitwidth

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 8 09:39:40 PST 2024


Author: Simon Pilgrim
Date: 2024-02-08T17:39:19Z
New Revision: bef25ae297d6d246bf0fa8667c8b08f9d5e8dae7

URL: https://github.com/llvm/llvm-project/commit/bef25ae297d6d246bf0fa8667c8b08f9d5e8dae7
DIFF: https://github.com/llvm/llvm-project/commit/bef25ae297d6d246bf0fa8667c8b08f9d5e8dae7.diff

LOG: [X86] X86FixupVectorConstants - use explicit register bitwidth for the loaded vector instead of using constant pool bitwidth

Fixes #81136 - we might be loading from a constant pool entry wider than the destination register bitwidth, affecting the vextload scale calculation.

ConvertToBroadcastAVX512 doesn't yet set an explicit bitwidth (it defaults to the constant pool bitwidth) due to difficulties in looking up the original register width through the fold tables. As we only use rebuildSplatCst there, this shouldn't cause any miscompilations, although it might prevent folding to broadcast if only the lower bits match a splatable pattern.
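
For illustration, a minimal sketch of the scale issue (not the pass's actual code; RegBits/PoolBits below are assumptions based on the pr81136.ll test, where a 128-bit load is fed by a wider constant pool entry):

  #include <cassert>

  // The extend-load rebuild effectively derives the destination element width
  // by dividing the total bitwidth by the candidate's element count.
  static unsigned dstEltBitWidth(unsigned NumBits, unsigned NumElts) {
    return NumBits / NumElts;
  }

  int main() {
    const unsigned RegBits = 128;  // xmm destination of the load
    const unsigned PoolBits = 256; // assumed wider constant pool entry
    // For a 4-element candidate, the register width gives 32-bit destination
    // elements, but the pool width gives 64-bit elements, so the fold is
    // validated against the wrong element layout.
    assert(dstEltBitWidth(RegBits, 4) == 32);
    assert(dstEltBitWidth(PoolBits, 4) == 64);
    return 0;
  }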

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86FixupVectorConstants.cpp
    llvm/test/CodeGen/X86/pr81136.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index 32ca9c164c579..da7dcbb25a957 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -226,6 +226,7 @@ static Constant *rebuildConstant(LLVMContext &Ctx, Type *SclTy,
 // width, built up of potentially smaller scalar values.
 static Constant *rebuildSplatCst(const Constant *C, unsigned /*NumBits*/,
                                  unsigned /*NumElts*/, unsigned SplatBitWidth) {
+  // TODO: Truncate to NumBits once ConvertToBroadcastAVX512 support this.
   std::optional<APInt> Splat = getSplatableConstant(C, SplatBitWidth);
   if (!Splat)
     return nullptr;
@@ -328,7 +329,8 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
     std::function<Constant *(const Constant *, unsigned, unsigned, unsigned)>
         RebuildConstant;
   };
-  auto FixupConstant = [&](ArrayRef<FixupEntry> Fixups, unsigned OperandNo) {
+  auto FixupConstant = [&](ArrayRef<FixupEntry> Fixups, unsigned RegBitWidth,
+                           unsigned OperandNo) {
 #ifdef EXPENSIVE_CHECKS
     assert(llvm::is_sorted(Fixups,
                            [](const FixupEntry &A, const FixupEntry &B) {
@@ -340,7 +342,8 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
     assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
            "Unexpected number of operands!");
     if (auto *C = X86::getConstantFromPool(MI, OperandNo)) {
-      unsigned RegBitWidth = C->getType()->getPrimitiveSizeInBits();
+      RegBitWidth =
+          RegBitWidth ? RegBitWidth : C->getType()->getPrimitiveSizeInBits();
       for (const FixupEntry &Fixup : Fixups) {
         if (Fixup.Op) {
           // Construct a suitable constant and adjust the MI to use the new
@@ -377,7 +380,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
     // TODO: SSE3 MOVDDUP Handling
     return FixupConstant({{X86::MOVSSrm, 1, 32, rebuildZeroUpperCst},
                           {X86::MOVSDrm, 1, 64, rebuildZeroUpperCst}},
-                         1);
+                         128, 1);
   case X86::VMOVAPDrm:
   case X86::VMOVAPSrm:
   case X86::VMOVUPDrm:
@@ -386,7 +389,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
                           {X86::VBROADCASTSSrm, 1, 32, rebuildSplatCst},
                           {X86::VMOVSDrm, 1, 64, rebuildZeroUpperCst},
                           {X86::VMOVDDUPrm, 1, 64, rebuildSplatCst}},
-                         1);
+                         128, 1);
   case X86::VMOVAPDYrm:
   case X86::VMOVAPSYrm:
   case X86::VMOVUPDYrm:
@@ -394,7 +397,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
     return FixupConstant({{X86::VBROADCASTSSYrm, 1, 32, rebuildSplatCst},
                           {X86::VBROADCASTSDYrm, 1, 64, rebuildSplatCst},
                           {X86::VBROADCASTF128rm, 1, 128, rebuildSplatCst}},
-                         1);
+                         256, 1);
   case X86::VMOVAPDZ128rm:
   case X86::VMOVAPSZ128rm:
   case X86::VMOVUPDZ128rm:
@@ -403,7 +406,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
                           {X86::VBROADCASTSSZ128rm, 1, 32, rebuildSplatCst},
                           {X86::VMOVSDZrm, 1, 64, rebuildZeroUpperCst},
                           {X86::VMOVDDUPZ128rm, 1, 64, rebuildSplatCst}},
-                         1);
+                         128, 1);
   case X86::VMOVAPDZ256rm:
   case X86::VMOVAPSZ256rm:
   case X86::VMOVUPDZ256rm:
@@ -412,7 +415,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
         {{X86::VBROADCASTSSZ256rm, 1, 32, rebuildSplatCst},
          {X86::VBROADCASTSDZ256rm, 1, 64, rebuildSplatCst},
          {X86::VBROADCASTF32X4Z256rm, 1, 128, rebuildSplatCst}},
-        1);
+        256, 1);
   case X86::VMOVAPDZrm:
   case X86::VMOVAPSZrm:
   case X86::VMOVUPDZrm:
@@ -421,7 +424,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
                           {X86::VBROADCASTSDZrm, 1, 64, rebuildSplatCst},
                           {X86::VBROADCASTF32X4rm, 1, 128, rebuildSplatCst},
                           {X86::VBROADCASTF64X4rm, 1, 256, rebuildSplatCst}},
-                         1);
+                         512, 1);
     /* Integer Loads */
   case X86::MOVDQArm:
   case X86::MOVDQUrm: {
@@ -440,7 +443,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
         {HasSSE41 ? X86::PMOVZXWDrm : 0, 4, 16, rebuildZExtCst},
         {HasSSE41 ? X86::PMOVSXDQrm : 0, 2, 32, rebuildSExtCst},
         {HasSSE41 ? X86::PMOVZXDQrm : 0, 2, 32, rebuildZExtCst}};
-    return FixupConstant(Fixups, 1);
+    return FixupConstant(Fixups, 128, 1);
   }
   case X86::VMOVDQArm:
   case X86::VMOVDQUrm: {
@@ -465,7 +468,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
         {X86::VPMOVZXWDrm, 4, 16, rebuildZExtCst},
         {X86::VPMOVSXDQrm, 2, 32, rebuildSExtCst},
         {X86::VPMOVZXDQrm, 2, 32, rebuildZExtCst}};
-    return FixupConstant(Fixups, 1);
+    return FixupConstant(Fixups, 128, 1);
   }
   case X86::VMOVDQAYrm:
   case X86::VMOVDQUYrm: {
@@ -490,7 +493,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
         {HasAVX2 ? X86::VPMOVZXWDYrm : 0, 8, 16, rebuildZExtCst},
         {HasAVX2 ? X86::VPMOVSXDQYrm : 0, 4, 32, rebuildSExtCst},
         {HasAVX2 ? X86::VPMOVZXDQYrm : 0, 4, 32, rebuildZExtCst}};
-    return FixupConstant(Fixups, 1);
+    return FixupConstant(Fixups, 256, 1);
   }
   case X86::VMOVDQA32Z128rm:
   case X86::VMOVDQA64Z128rm:
@@ -515,7 +518,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
         {X86::VPMOVZXWDZ128rm, 4, 16, rebuildZExtCst},
         {X86::VPMOVSXDQZ128rm, 2, 32, rebuildSExtCst},
         {X86::VPMOVZXDQZ128rm, 2, 32, rebuildZExtCst}};
-    return FixupConstant(Fixups, 1);
+    return FixupConstant(Fixups, 128, 1);
   }
   case X86::VMOVDQA32Z256rm:
   case X86::VMOVDQA64Z256rm:
@@ -539,7 +542,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
         {X86::VPMOVZXWDZ256rm, 8, 16, rebuildZExtCst},
         {X86::VPMOVSXDQZ256rm, 4, 32, rebuildSExtCst},
         {X86::VPMOVZXDQZ256rm, 4, 32, rebuildZExtCst}};
-    return FixupConstant(Fixups, 1);
+    return FixupConstant(Fixups, 256, 1);
   }
   case X86::VMOVDQA32Zrm:
   case X86::VMOVDQA64Zrm:
@@ -564,7 +567,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
         {X86::VPMOVZXWDZrm, 16, 16, rebuildZExtCst},
         {X86::VPMOVSXDQZrm, 8, 32, rebuildSExtCst},
         {X86::VPMOVZXDQZrm, 8, 32, rebuildZExtCst}};
-    return FixupConstant(Fixups, 1);
+    return FixupConstant(Fixups, 512, 1);
   }
   }
 
@@ -592,7 +595,9 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
       unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32;
       FixupEntry Fixups[] = {{(int)OpBcst32, 32, 32, rebuildSplatCst},
                              {(int)OpBcst64, 64, 64, rebuildSplatCst}};
-      return FixupConstant(Fixups, OpNo);
+      // TODO: Add support for RegBitWidth, but currently rebuildSplatCst
+      // doesn't require it (defaults to Constant::getPrimitiveSizeInBits).
+      return FixupConstant(Fixups, 0, OpNo);
     }
     return false;
   };

diff --git a/llvm/test/CodeGen/X86/pr81136.ll b/llvm/test/CodeGen/X86/pr81136.ll
index 8843adca0933c..b4ac3fc783e0a 100644
--- a/llvm/test/CodeGen/X86/pr81136.ll
+++ b/llvm/test/CodeGen/X86/pr81136.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s
 
-; FIXME: Should be vpmovzxbq[128,1] instead of vpmovzxbd[128,1,0,0]
 define i64 @PR81136(i32 %a0, i32 %a1, ptr %a2) {
 ; CHECK-LABEL: PR81136:
 ; CHECK:       # %bb.0:
@@ -9,7 +8,7 @@ define i64 @PR81136(i32 %a0, i32 %a1, ptr %a2) {
 ; CHECK-NEXT:    vmovd %esi, %xmm1
 ; CHECK-NEXT:    vmovdqa (%rdx), %ymm2
 ; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm4 = [128,1,0,0]
+; CHECK-NEXT:    vpmovzxbq {{.*#+}} xmm4 = [128,1]
 ; CHECK-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm4
 ; CHECK-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1


        

