[llvm] AMDGPU: Implement llvm.set.rounding (PR #88587)

Thu Apr 18 06:55:03 PDT 2024

================
@@ -4056,6 +4057,75 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
   return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
 }
 
+SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+
+  SDValue NewMode = Op.getOperand(1);
+  assert(NewMode.getValueType() == MVT::i32);
+
+  // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
+  // hardware MODE.fp_round values.
+  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
+    uint32_t ClampedVal = std::min(
+        static_cast<uint32_t>(ConstMode->getZExtValue()),
+        static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
+    NewMode = DAG.getConstant(
+        AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
+  } else {
+    SDValue BitTable =
+        DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
+
+    // The supported standard values are 0-3. The extended values start at 8. We
+    // need to offset by 4 if the value is in the extended range.
+
+    // is_standard = value < 4;
+    // table_index = is_standard ? value : (value - 4)
+    // MODE.fp_round = (bit_table >> table_index) & 0xf
----------------
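jayfoad wrote:

The table entries are 4 bits wide, so the shift amount needs to be the table index scaled by 4 rather than the raw index:

```suggestion
    // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
```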

https://github.com/llvm/llvm-project/pull/88587