From 8efe8690ce404c35aa06b4f80847bb0a534188c7 Mon Sep 17 00:00:00 2001
From: Martijn Courteaux
Date: Tue, 3 Dec 2024 09:31:52 +0100
Subject: [PATCH] Feedback from Steven.

---
 src/ApproximationTables.cpp      | 36 +++++++++++++++++++++++++-------
 src/ApproximationTables.h        |  5 ++++-
 src/IROperator.cpp               | 12 +----------
 src/IROperator.h                 | 13 ++++++------
 test/performance/fast_arctan.cpp |  2 +-
 5 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/src/ApproximationTables.cpp b/src/ApproximationTables.cpp
index a3af6dfaacd1..1a68d441b0ef 100644
--- a/src/ApproximationTables.cpp
+++ b/src/ApproximationTables.cpp
@@ -7,9 +7,17 @@ namespace {
 
 using OO = ApproximationPrecision::OptimizationObjective;
 
+// clang-format off
 // Generate this table with:
 // python3 src/polynomial_optimizer.py atan --order 1 2 3 4 5 6 7 8 --loss mse mae mulpe mulpe_mae --no-gui --format table
-std::vector<Approximation> table_atan = {
+//
+// Note that the maximal errors are computed with numpy in double precision.
+// The real errors are a bit larger with single-precision floats (see correctness/fast_arctan.cpp).
+// Also note that ULP distances that are not whole units are an artifact of measuring this error
+// in double precision as well: the reconstruction had more bits of
+// precision than the actual float32 target value, so in practice the MaxULP Error
+// will be close to round(MaxUlpE).
+const std::vector<Approximation> table_atan = {
     {OO::MSE, 9.249650e-04, 7.078984e-02, 2.411e+06, {+8.56188008e-01}},
     {OO::MSE, 1.026356e-05, 9.214909e-03, 3.985e+05, {+9.76213454e-01, -2.00030200e-01}},
     {OO::MSE, 1.577588e-07, 1.323851e-03, 6.724e+04, {+9.95982073e-01, -2.92278128e-01, +8.30180680e-02}},
@@ -46,21 +54,28 @@
     {OO::MULPE_MAE, 3.053218e-14, 3.784868e-07, 4.181e+01, {+9.99997480e-01, -3.33205127e-01, +1.98309644e-01, -1.33094430e-01, +8.08643094e-02, -3.45859503e-02, +7.11261604e-03}},
     {OO::MULPE_MAE, 7.018877e-16, 5.862915e-08, 6.942e+00, {+9.99999581e-01, -3.33306326e-01, +1.99542180e-01, -1.39433369e-01, +9.72462857e-02, -5.69734398e-02, +2.25639390e-02, -4.24074590e-03}},
 };
+// clang-format on
 
 } // namespace
 
-const Approximation *find_best_approximation(const std::vector<Approximation> &table, ApproximationPrecision precision) {
+const Approximation *find_best_approximation(const std::vector<Approximation> &table,
+                                             ApproximationPrecision precision) {
+#define DEBUG_APPROXIMATION_SEARCH 0
     const Approximation *best = nullptr;
     constexpr int term_cost = 20;
     constexpr int extra_term_cost = 200;
     double best_score = 0;
 
-    // std::printf("Looking for min_terms=%d, max_absolute_error=%f\n", precision.constraint_min_poly_terms, precision.constraint_max_absolute_error);
+#if DEBUG_APPROXIMATION_SEARCH
+    std::printf("Looking for min_terms=%d, max_absolute_error=%f\n",
+                precision.constraint_min_poly_terms, precision.constraint_max_absolute_error);
+#endif
     for (size_t i = 0; i < table.size(); ++i) {
         const Approximation &e = table[i];
         double penalty = 0.0;
 
         int obj_score = e.objective == precision.optimized_for ? 100 * term_cost : 0;
-        if (precision.optimized_for == ApproximationPrecision::MULPE_MAE && e.objective == ApproximationPrecision::MULPE) {
+        if (precision.optimized_for == ApproximationPrecision::MULPE_MAE &&
+            e.objective == ApproximationPrecision::MULPE) {
             obj_score = 50 * term_cost;  // When MULPE_MAE is not available, prefer MULPE.
         }
@@ -87,19 +102,26 @@ const Approximation *find_best_approximation(const std::vector<Approximation> &t
             break;
         }
 
-        if (precision.constraint_max_absolute_error > 0.0 && precision.constraint_max_absolute_error < e.mae) {
+        if (precision.constraint_max_absolute_error > 0.0 &&
+            precision.constraint_max_absolute_error < e.mae) {
             float error_ratio = e.mae / precision.constraint_max_absolute_error;
             penalty += 20 * error_ratio * extra_term_cost;  // penalty for not getting the required precision.
         }
 
         double score = obj_score + term_count_score + precision_score - penalty;
-        // std::printf("Score for %zu (%zu terms): %f = %d + %d + %f - penalty %f\n", i, e.coefficients.size(), score, obj_score, term_count_score, precision_score, penalty);
+#if DEBUG_APPROXIMATION_SEARCH
+        std::printf("Score for %zu (%zu terms): %f = %d + %d + %f - penalty %f\n",
+                    i, e.coefficients.size(), score, obj_score, term_count_score,
+                    precision_score, penalty);
+#endif
         if (score > best_score || best == nullptr) {
             best = &e;
             best_score = score;
         }
     }
 
-    // std::printf("Best score: %f\n", best_score);
+#if DEBUG_APPROXIMATION_SEARCH
+    std::printf("Best score: %f\n", best_score);
+#endif
     return best;
 }
diff --git a/src/ApproximationTables.h b/src/ApproximationTables.h
index ddf38ca9bf41..3af680a2e08d 100644
--- a/src/ApproximationTables.h
+++ b/src/ApproximationTables.h
@@ -1,4 +1,5 @@
-#pragma once
+#ifndef HALIDE_APPROXIMATION_TABLES_H
+#define HALIDE_APPROXIMATION_TABLES_H
 
 #include <vector>
 
@@ -19,3 +20,5 @@ const Approximation *best_atan_approximation(Halide::ApproximationPrecision prec
 
 } // namespace Internal
 } // namespace Halide
+
+#endif
diff --git a/src/IROperator.cpp b/src/IROperator.cpp
index 2f1c5492e2b0..94f5c9d268ae 100644
--- a/src/IROperator.cpp
+++ b/src/IROperator.cpp
@@ -1438,19 +1438,8 @@ Expr fast_atan_approximation(const Expr &x_full, ApproximationPrecision precisio
     } else {
         x = select(x_gt_1, 1.0f / x_full, x_full);
     }
-
-    // Coefficients obtained using src/polynomial_optimizer.py
-    // Note that the maximal errors are computed with numpy with double precision.
-    // The real errors are a bit larger with single-precision floats (see correctness/fast_arctan.cpp).
-    // Also note that ULP distances which are not units are bogus, but this is because this error
-    // was again measured with double precision, so the actual reconstruction had more bits of precision
-    // than the actual float32 target value. So in practice the MaxULP Error will be close to round(MaxUlpE).
-
-    // The table is huge, so let's put clang-format off and handle the layout manually:
-    // clang-format off
     const Internal::Approximation *approx = Internal::best_atan_approximation(precision);
     const std::vector<double> &c = approx->coefficients;
-
     Expr x2 = x * x;
     Expr result = float(c.back());
     for (size_t i = 1; i < c.size(); ++i) {
@@ -1463,6 +1452,7 @@ Expr fast_atan_approximation(const Expr &x_full, ApproximationPrecision precisio
     }
     return common_subexpression_elimination(result);
 }
+
 Expr fast_atan(const Expr &x_full, ApproximationPrecision precision) {
     return fast_atan_approximation(x_full, precision, false);
 }
diff --git a/src/IROperator.h b/src/IROperator.h
index 4f120a9fbac5..57c41a6e4d47 100644
--- a/src/IROperator.h
+++ b/src/IROperator.h
@@ -972,8 +972,7 @@ Expr fast_sin(const Expr &x);
 Expr fast_cos(const Expr &x);
 // @}
 
-/**
- * Struct that allows the user to specify several requirements for functions
+/** Struct that allows the user to specify several requirements for functions
  * that are approximated by polynomial expansions. These polynomials can be
 * optimized for four different metrics: Mean Squared Error, Maximum Absolute Error,
 * Maximum Units in Last Place (ULP) Error, or a 50%/50% blend of MAE and MULPE.
@@ -981,9 +980,9 @@ Expr fast_cos(const Expr &x);
 * Orthogonally to the optimization objective, these polynomials can vary
 * in degree. Higher degree polynomials will give more precise results.
 * Note that instead of specifying the degree, the number of terms is used instead.
- * E.g., even symmetric functions may be implemented using only even powers, for which
- * A number of terms of 4 would actually mean that terms in [1, x^2, x^4, x^6] are used,
- * which is degree 6.
+ * E.g., even (i.e., symmetric) functions may be implemented using only even powers,
+ * for which a number of terms of 4 would actually mean that terms
+ * in [1, x^2, x^4, x^6] are used, which is degree 6.
 *
 * Additionally, if you don't care about number of terms in the polynomial
 * and you do care about the maximal absolute error the approximation may have
@@ -1014,8 +1013,8 @@ struct ApproximationPrecision {
 * For more info on the available approximations and their precisions, see the table in ApproximationTables.cpp.
 *
 * Note: the polynomial uses odd powers, so the number of terms is not the degree of the polynomial.
- * Note: Poly8 is only useful to increase precision for atan, and not for atan2.
- * Note: The performance of this functions seem to be not reliably faster on WebGPU (for now, August 2024).
+ * Note: the polynomial with 8 terms is only useful to increase precision for fast_atan, and not for fast_atan2.
+ * Note: these functions do not seem to be reliably faster on WebGPU (for now, August 2024).
 */
 // @{
 Expr fast_atan(const Expr &x, ApproximationPrecision precision = {ApproximationPrecision::MULPE, 6});
diff --git a/test/performance/fast_arctan.cpp b/test/performance/fast_arctan.cpp
index 74e7b8092762..680e24ff7f66 100644
--- a/test/performance/fast_arctan.cpp
+++ b/test/performance/fast_arctan.cpp
@@ -26,7 +26,7 @@ int main(int argc, char **argv) {
     Expr t0 = x / float(test_w);
     Expr t1 = y / float(test_h);
 
-    // To make sure we time mostely the computation of the arctan, and not memory bandwidth,
+    // To make sure we time mostly the computation of the arctan, and not memory bandwidth,
     // we will compute many arctans per output and sum them. In my testing, GPUs suffer more
     // from bandwith with this test, so we give it more arctangents to compute per output.
    const int test_d = target.has_gpu_feature() ? 1024 : 64;
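
For context, a minimal usage sketch of the user-facing API this patch touches (not part of the diff). It only assumes what is visible above: the aggregate order {objective, min_terms} from fast_atan's default argument, and the MULPE_MAE objective used in the table; the Func/Var names, the input expression, and the realize() size are illustrative.

#include "Halide.h"

using namespace Halide;

int main() {
    Var x("x");
    Func f("f");

    // Request a polynomial optimized for the blended MULPE/MAE objective with at least
    // 4 terms. The brace order {objective, min_terms} mirrors fast_atan's default
    // argument {ApproximationPrecision::MULPE, 6} declared in IROperator.h.
    ApproximationPrecision prec = {ApproximationPrecision::MULPE_MAE, 4};

    // Same idiom as the performance test: build a float Expr and take its arctangent.
    Expr t = x / 1024.0f;
    f(x) = fast_atan(t, prec);

    Buffer<float> out = f.realize({1024});
    return 0;
}

Internally, best_atan_approximation() uses find_best_approximation() to score each table entry against this requested objective and term count, so relaxing min_terms or the absolute-error constraint trades precision for fewer polynomial terms.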