[llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for bitfield extract (PR #132381)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 27 07:49:18 PDT 2025


================
@@ -127,6 +131,117 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
   MI.eraseFromParent();
 }
 
+bool isSignedBFE(MachineInstr &MI) {
+  unsigned Opc =
+      isa<GIntrinsic>(MI) ? MI.getOperand(1).getIntrinsicID() : MI.getOpcode();
+
+  switch (Opc) {
+  case AMDGPU::G_SBFX:
+  case Intrinsic::amdgcn_sbfe:
+    return true;
+  case AMDGPU::G_UBFX:
+  case Intrinsic::amdgcn_ubfe:
+    return false;
+  default:
+    llvm_unreachable("Opcode not supported");
+  }
+}
+
+void RegBankLegalizeHelper::lowerDiv_BFE(MachineInstr &MI) {
+  Register Dst = MI.getOperand(0).getReg();
+  assert(MRI.getType(Dst) == LLT::scalar(64));
+  bool Signed = isSignedBFE(MI);
+  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
+  // Extract bitfield from Src, LSBit is the least-significant bit for the
+  // extraction (field offset) and Width is size of bitfield.
+  Register Src = MI.getOperand(FirstOpnd).getReg();
+  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
+  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
+  // Comments are for signed bitfield extract, similar for unsigned. x is sign
+  // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.
+
+  // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
+  unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
+  auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
+
+  auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);
+
+  // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
+  // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
+  // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
+  if (!ConstWidth) {
+    auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
+    auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
+    B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
+    MI.eraseFromParent();
+    return;
+  }
+
+  auto WidthImm = ConstWidth->Value.getZExtValue();
+  auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
+  Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
+  Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
+  auto Zero = B.buildConstant({VgprRB, S32}, 0);
+  unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
+
+  if (WidthImm <= 32) {
+    // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
+    auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
+    MachineInstrBuilder Hi;
+    if (Signed) {
+      // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
+      Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
+    } else {
+      // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
+      Hi = Zero;
+    }
+    B.buildMergeLikeInstr(Dst, {Lo, Hi});
+  } else {
+    auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
+    // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
+    auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
+    B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
+  }
+
+  MI.eraseFromParent();
+  return;
+}
+
+void RegBankLegalizeHelper::lowerUni_BFE(MachineInstr &MI) {
+  Register DstReg = MI.getOperand(0).getReg();
+  LLT Ty = MRI.getType(DstReg);
+  bool Signed = isSignedBFE(MI);
+  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
+  Register Src = MI.getOperand(FirstOpnd).getReg();
+  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
+  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
+  // For uniform bit field extract there are 4 available instructions, but
+  // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
+  // field offset in low and size in high 16 bits.
+
+  // Src1 Hi16|Lo16 = Size|FieldOffset
+  auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
+  auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
+  auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
+  auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
+  unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
+  unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
----------------
arsenm wrote:

I'm worried about this harming the ability of downstream transforms and analyses to do anything with the operation. We should still be able to handle known and demanded bits. Also don't want to break any patterns that read BFE inputs. Except for the intrinsic with variable inputs case, can't you just use the G_* opcodes? 

https://github.com/llvm/llvm-project/pull/132381


More information about the llvm-commits mailing list