[llvm] [X86] Fold blend(pshufb(x,m1),pshufb(y,m2)) -> blend(pshufb(x,blend(m1,m2)),pshufb(y,blend(m1,m2))) to reduce constant pool (PR #98466)
    Simon Pilgrim via llvm-commits 
    llvm-commits at lists.llvm.org
       
    Tue Jul 16 03:34:43 PDT 2024
    
        - Previous message: [llvm] [X86] Fold blend(pshufb(x,m1),pshufb(y,m2)) -> blend(pshufb(x,blend(m1,m2)),pshufb(y,blend(m1,m2))) to reduce constant pool (PR #98466)
- Next message: [llvm] [X86] Fold blend(pshufb(x,m1),pshufb(y,m2)) -> blend(pshufb(x,blend(m1,m2)),pshufb(y,blend(m1,m2))) to reduce constant pool (PR #98466)
-  Messages sorted by: 
              [ date ]
              [ thread ]
              [ subject ]
              [ author ]
         
  
================
@@ -41016,23 +41016,59 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
   case X86ISD::BLENDI: {
     SDValue N0 = N.getOperand(0);
     SDValue N1 = N.getOperand(1);
-
-    // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
-    // TODO: Handle MVT::v16i16 repeated blend mask.
-    if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
-        N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
-      MVT SrcVT = N0.getOperand(0).getSimpleValueType();
-      if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
-          SrcVT.getScalarSizeInBits() >= 32) {
-        unsigned Size = VT.getVectorNumElements();
-        unsigned NewSize = SrcVT.getVectorNumElements();
-        APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size);
-        APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
-        return DAG.getBitcast(
-            VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
-                            N1.getOperand(0),
-                            DAG.getTargetConstant(NewBlendMask.getZExtValue(),
-                                                  DL, MVT::i8)));
+    unsigned EltBits = VT.getScalarSizeInBits();
+
+    if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
+      // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
+      // TODO: Handle MVT::v16i16 repeated blend mask.
+      if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
+        MVT SrcVT = N0.getOperand(0).getSimpleValueType();
+        unsigned SrcBits = SrcVT.getScalarSizeInBits();
+        if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
+          unsigned Size = VT.getVectorNumElements();
+          unsigned NewSize = SrcVT.getVectorNumElements();
+          APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size);
+          APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
+          return DAG.getBitcast(
+              VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
+                              N1.getOperand(0),
+                              DAG.getTargetConstant(NewBlendMask.getZExtValue(),
+                                                    DL, MVT::i8)));
+        }
+      }
+      // Share PSHUFB masks:
+      // blend(pshufb(x,m1),pshufb(y,m2))
+      // --> m3 = blend(m1,m2)
+      //     blend(pshufb(x,m3),pshufb(y,m3))
+      if (N0.hasOneUse() && N1.hasOneUse()) {
+        SmallVector<int> Mask, ByteMask;
+        SmallVector<SDValue> Ops;
+        SDValue LHS = peekThroughOneUseBitcasts(N0);
+        SDValue RHS = peekThroughOneUseBitcasts(N1);
+        if (LHS.getOpcode() == X86ISD::PSHUFB &&
+            RHS.getOpcode() == X86ISD::PSHUFB &&
+            LHS.getOperand(1) != RHS.getOperand(1) &&
+            (LHS.getOperand(1).hasOneUse() || RHS.getOperand(1).hasOneUse()) &&
----------------
RKSimon wrote:
We don't - its a bit of a mess! Let me see what happens when I remove the oneuse checks (some of this patch is very old).
https://github.com/llvm/llvm-project/pull/98466
    
    
        
	- Previous message: [llvm] [X86] Fold blend(pshufb(x,m1),pshufb(y,m2)) -> blend(pshufb(x,blend(m1,m2)),pshufb(y,blend(m1,m2))) to reduce constant pool (PR #98466)
- Next message: [llvm] [X86] Fold blend(pshufb(x,m1),pshufb(y,m2)) -> blend(pshufb(x,blend(m1,m2)),pshufb(y,blend(m1,m2))) to reduce constant pool (PR #98466)
-  Messages sorted by: 
              [ date ]
              [ thread ]
              [ subject ]
              [ author ]
         
More information about the llvm-commits
mailing list