[llvm] [AMDGPU] Eliminate unnecessary packing in wider f16 vectors for sdwa/opsel-able instruction (PR #137137)

Mon Dec 8 22:58:20 PST 2025

================
@@ -1369,6 +1457,434 @@ bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
   return SIPeepholeSDWA().run(MF);
 }
 
+static bool isSrcDestFP16Bits(MachineInstr *MI, const SIInstrInfo *TII) {
+  static const DenseSet<unsigned> FP16BitOpcodes = {
+      AMDGPU::V_CVT_F16_U16_e32,       AMDGPU::V_CVT_F16_U16_e64,
+      AMDGPU::V_CVT_F16_I16_e32,       AMDGPU::V_CVT_F16_I16_e64,
+      AMDGPU::V_RCP_F16_e64,           AMDGPU::V_RCP_F16_e32,
+      AMDGPU::V_RSQ_F16_e64,           AMDGPU::V_RSQ_F16_e32,
+      AMDGPU::V_SQRT_F16_e64,          AMDGPU::V_SQRT_F16_e32,
+      AMDGPU::V_LOG_F16_e64,           AMDGPU::V_LOG_F16_e32,
+      AMDGPU::V_EXP_F16_e64,           AMDGPU::V_EXP_F16_e32,
+      AMDGPU::V_SIN_F16_e64,           AMDGPU::V_SIN_F16_e32,
+      AMDGPU::V_COS_F16_e64,           AMDGPU::V_COS_F16_e32,
+      AMDGPU::V_FLOOR_F16_e64,         AMDGPU::V_FLOOR_F16_e32,
+      AMDGPU::V_CEIL_F16_e64,          AMDGPU::V_CEIL_F16_e32,
+      AMDGPU::V_TRUNC_F16_e64,         AMDGPU::V_TRUNC_F16_e32,
+      AMDGPU::V_RNDNE_F16_e64,         AMDGPU::V_RNDNE_F16_e32,
+      AMDGPU::V_FRACT_F16_e64,         AMDGPU::V_FRACT_F16_e32,
+      AMDGPU::V_FREXP_MANT_F16_e64,    AMDGPU::V_FREXP_MANT_F16_e32,
+      AMDGPU::V_FREXP_EXP_I16_F16_e64, AMDGPU::V_FREXP_EXP_I16_F16_e32,
+      AMDGPU::V_LDEXP_F16_e64,         AMDGPU::V_LDEXP_F16_e32,
+      AMDGPU::V_ADD_F16_e64,           AMDGPU::V_ADD_F16_e32,
+      AMDGPU::V_SUB_F16_e64,           AMDGPU::V_SUB_F16_e32,
+      AMDGPU::V_SUBREV_F16_e64,        AMDGPU::V_SUBREV_F16_e32,
+      AMDGPU::V_MUL_F16_e64,           AMDGPU::V_MUL_F16_e32,
+      AMDGPU::V_MAX_F16_e64,           AMDGPU::V_MAX_F16_e32,
+      AMDGPU::V_MIN_F16_e64,           AMDGPU::V_MIN_F16_e32,
+      AMDGPU::V_MAD_F16_e64,           AMDGPU::V_FMA_F16_e64,
+      AMDGPU::V_DIV_FIXUP_F16_e64};
+
+  unsigned Opcode = MI->getOpcode();
+  if (TII->isSDWA(Opcode))
+    Opcode = AMDGPU::getBasicFromSDWAOp(Opcode);
+
+  return FP16BitOpcodes.contains(Opcode);
----------------
vg0204 wrote:

> Broader question: are there other opcodes that use OPSEL or SDWA that are eligible for this optimization? (I'd suspect there are some BF16 ones, say)

Following up on your question, I used Cursor to generate a list of such opcodes, which I have attached here:
[AMDGPU_16bit_Opcodes.md](https://github.com/user-attachments/files/24048736/AMDGPU_16bit_Opcodes.md). The list looks suitable to me for use here.


https://github.com/llvm/llvm-project/pull/137137