From 8efe8690ce404c35aa06b4f80847bb0a534188c7 Mon Sep 17 00:00:00 2001
From: Martijn Courteaux
Date: Tue, 3 Dec 2024 09:31:52 +0100
Subject: [PATCH] Feedback from Steven.

---
 src/ApproximationTables.cpp      | 36 +++++++++++++++++++++++++-------
 src/ApproximationTables.h        |  5 ++++-
 src/IROperator.cpp               | 12 +----------
 src/IROperator.h                 | 13 ++++++------
 test/performance/fast_arctan.cpp |  2 +-
 5 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/src/ApproximationTables.cpp b/src/ApproximationTables.cpp
index a3af6dfaacd1..1a68d441b0ef 100644
--- a/src/ApproximationTables.cpp
+++ b/src/ApproximationTables.cpp
@@ -7,9 +7,17 @@ namespace {
 
 using OO = ApproximationPrecision::OptimizationObjective;
 
+// clang-format off
 // Generate this table with:
 // python3 src/polynomial_optimizer.py atan --order 1 2 3 4 5 6 7 8 --loss mse mae mulpe mulpe_mae --no-gui --format table
-std::vector<Approximation> table_atan = {
+//
+// Note that the maximal errors are computed with numpy in double precision.
+// The real errors are a bit larger with single-precision floats (see correctness/fast_arctan.cpp).
+// Also note that ULP distances that are not whole units are an artifact of measuring this error
+// in double precision as well: the reconstruction had more bits of
+// precision than the actual float32 target value, so in practice the MaxULP Error
+// will be close to round(MaxUlpE).
+const std::vector<Approximation> table_atan = {
     {OO::MSE, 9.249650e-04, 7.078984e-02, 2.411e+06, {+8.56188008e-01}},
     {OO::MSE, 1.026356e-05, 9.214909e-03, 3.985e+05, {+9.76213454e-01, -2.00030200e-01}},
     {OO::MSE, 1.577588e-07, 1.323851e-03, 6.724e+04, {+9.95982073e-01, -2.92278128e-01, +8.30180680e-02}},
@@ -46,21 +54,28 @@
     {OO::MULPE_MAE, 3.053218e-14, 3.784868e-07, 4.181e+01, {+9.99997480e-01, -3.33205127e-01, +1.98309644e-01, -1.33094430e-01, +8.08643094e-02, -3.45859503e-02, +7.11261604e-03}},
     {OO::MULPE_MAE, 7.018877e-16, 5.862915e-08, 6.942e+00, {+9.99999581e-01, -3.33306326e-01, +1.99542180e-01, -1.39433369e-01, +9.72462857e-02, -5.69734398e-02, +2.25639390e-02, -4.24074590e-03}},
 };
+// clang-format on
 
 } // namespace
 
-const Approximation *find_best_approximation(const std::vector<Approximation> &table, ApproximationPrecision precision) {
+const Approximation *find_best_approximation(const std::vector<Approximation> &table,
+                                             ApproximationPrecision precision) {
+#define DEBUG_APPROXIMATION_SEARCH 0
     const Approximation *best = nullptr;
     constexpr int term_cost = 20;
     constexpr int extra_term_cost = 200;
     double best_score = 0;
 
-    // std::printf("Looking for min_terms=%d, max_absolute_error=%f\n", precision.constraint_min_poly_terms, precision.constraint_max_absolute_error);
+#if DEBUG_APPROXIMATION_SEARCH
+    std::printf("Looking for min_terms=%d, max_absolute_error=%f\n",
+                precision.constraint_min_poly_terms, precision.constraint_max_absolute_error);
+#endif
     for (size_t i = 0; i < table.size(); ++i) {
         const Approximation &e = table[i];
         double penalty = 0.0;
 
         int obj_score = e.objective == precision.optimized_for ? 100 * term_cost : 0;
-        if (precision.optimized_for == ApproximationPrecision::MULPE_MAE && e.objective == ApproximationPrecision::MULPE) {
+        if (precision.optimized_for == ApproximationPrecision::MULPE_MAE &&
+            e.objective == ApproximationPrecision::MULPE) {
             obj_score = 50 * term_cost;  // When MULPE_MAE is not available, prefer MULPE.
         }
@@ -87,19 +102,26 @@ const Approximation *find_best_approximation(const std::vector<Approximation> &t
             break;
         }
 
-        if (precision.constraint_max_absolute_error > 0.0 && precision.constraint_max_absolute_error < e.mae) {
+        if (precision.constraint_max_absolute_error > 0.0 &&
+            precision.constraint_max_absolute_error < e.mae) {
             float error_ratio = e.mae / precision.constraint_max_absolute_error;
             penalty += 20 * error_ratio * extra_term_cost;  // penalty for not getting the required precision.
         }
 
         double score = obj_score + term_count_score + precision_score - penalty;
-        // std::printf("Score for %zu (%zu terms): %f = %d + %d + %f - penalty %f\n", i, e.coefficients.size(), score, obj_score, term_count_score, precision_score, penalty);
+#if DEBUG_APPROXIMATION_SEARCH
+        std::printf("Score for %zu (%zu terms): %f = %d + %d + %f - penalty %f\n",
+                    i, e.coefficients.size(), score, obj_score, term_count_score,
+                    precision_score, penalty);
+#endif
         if (score > best_score || best == nullptr) {
             best = &e;
             best_score = score;
         }
     }
 
-    // std::printf("Best score: %f\n", best_score);
+#if DEBUG_APPROXIMATION_SEARCH
+    std::printf("Best score: %f\n", best_score);
+#endif
     return best;
 }
diff --git a/src/ApproximationTables.h b/src/ApproximationTables.h
index ddf38ca9bf41..3af680a2e08d 100644
--- a/src/ApproximationTables.h
+++ b/src/ApproximationTables.h
@@ -1,4 +1,5 @@
-#pragma once
+#ifndef HALIDE_APPROXIMATION_TABLES_H
+#define HALIDE_APPROXIMATION_TABLES_H
 
 #include <vector>
 
@@ -19,3 +20,5 @@ const Approximation *best_atan_approximation(Halide::ApproximationPrecision prec
 
 } // namespace Internal
 } // namespace Halide
+
+#endif
diff --git a/src/IROperator.cpp b/src/IROperator.cpp
index 2f1c5492e2b0..94f5c9d268ae 100644
--- a/src/IROperator.cpp
+++ b/src/IROperator.cpp
@@ -1438,19 +1438,8 @@ Expr fast_atan_approximation(const Expr &x_full, ApproximationPrecision precisio
     } else {
         x = select(x_gt_1, 1.0f / x_full, x_full);
     }
-
-    // Coefficients obtained using src/polynomial_optimizer.py
-    // Note that the maximal errors are computed with numpy with double precision.
-    // The real errors are a bit larger with single-precision floats (see correctness/fast_arctan.cpp).
-    // Also note that ULP distances which are not units are bogus, but this is because this error
-    // was again measured with double precision, so the actual reconstruction had more bits of precision
-    // than the actual float32 target value. So in practice the MaxULP Error will be close to round(MaxUlpE).
-
-    // The table is huge, so let's put clang-format off and handle the layout manually:
-    // clang-format off
     const Internal::Approximation *approx = Internal::best_atan_approximation(precision);
     const std::vector<double> &c = approx->coefficients;
-
     Expr x2 = x * x;
     Expr result = float(c.back());
     for (size_t i = 1; i < c.size(); ++i) {
@@ -1463,6 +1452,7 @@ Expr fast_atan_approximation(const Expr &x_full, ApproximationPrecision precisio
     }
     return common_subexpression_elimination(result);
 }
+
 Expr fast_atan(const Expr &x_full, ApproximationPrecision precision) {
     return fast_atan_approximation(x_full, precision, false);
 }
diff --git a/src/IROperator.h b/src/IROperator.h
index 4f120a9fbac5..57c41a6e4d47 100644
--- a/src/IROperator.h
+++ b/src/IROperator.h
@@ -972,8 +972,7 @@ Expr fast_sin(const Expr &x);
 Expr fast_cos(const Expr &x);
 // @}
 
-/**
- * Struct that allows the user to specify several requirements for functions
+/** Struct that allows the user to specify several requirements for functions
  * that are approximated by polynomial expansions. These polynomials can be
 * optimized for four different metrics: Mean Squared Error, Maximum Absolute Error,
 * Maximum Units in Last Place (ULP) Error, or a 50%/50% blend of MAE and MULPE.
@@ -981,9 +980,9 @@ Expr fast_cos(const Expr &x);
 * Orthogonally to the optimization objective, these polynomials can vary
 * in degree. Higher degree polynomials will give more precise results.
 * Note that instead of specifying the degree, the number of terms is used instead.
- * E.g., even symmetric functions may be implemented using only even powers, for which
- * A number of terms of 4 would actually mean that terms in [1, x^2, x^4, x^6] are used,
- * which is degree 6.
+ * E.g., even (i.e., symmetric) functions may be implemented using only even powers,
+ * for which a number of terms of 4 would actually mean that terms
+ * in [1, x^2, x^4, x^6] are used, which is degree 6.
 *
 * Additionally, if you don't care about number of terms in the polynomial
 * and you do care about the maximal absolute error the approximation may have
@@ -1014,8 +1013,8 @@ struct ApproximationPrecision {
 * For more info on the available approximations and their precisions, see the table in ApproximationTables.cpp.
 *
 * Note: the polynomial uses odd powers, so the number of terms is not the degree of the polynomial.
- * Note: Poly8 is only useful to increase precision for atan, and not for atan2.
- * Note: The performance of this functions seem to be not reliably faster on WebGPU (for now, August 2024).
+ * Note: the polynomial with 8 terms is only useful to increase precision for fast_atan, and not for fast_atan2.
+ * Note: these functions do not seem to be reliably faster on WebGPU (for now, August 2024).
 */
 // @{
 Expr fast_atan(const Expr &x, ApproximationPrecision precision = {ApproximationPrecision::MULPE, 6});
diff --git a/test/performance/fast_arctan.cpp b/test/performance/fast_arctan.cpp
index 74e7b8092762..680e24ff7f66 100644
--- a/test/performance/fast_arctan.cpp
+++ b/test/performance/fast_arctan.cpp
@@ -26,7 +26,7 @@ int main(int argc, char **argv) {
     Expr t0 = x / float(test_w);
     Expr t1 = y / float(test_h);
 
-    // To make sure we time mostely the computation of the arctan, and not memory bandwidth,
+    // To make sure we time mostly the computation of the arctan, and not memory bandwidth,
     // we will compute many arctans per output and sum them. In my testing, GPUs suffer more
     // from bandwith with this test, so we give it more arctangents to compute per output.
    const int test_d = target.has_gpu_feature() ? 1024 : 64;
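
For context, a minimal usage sketch of the user-facing API this patch touches (not part of the diff). It only assumes what is visible above: the aggregate order {objective, min_terms} from fast_atan's default argument, and the MULPE_MAE objective used in the table; the Func/Var names, the input expression, and the realize() size are illustrative.

#include "Halide.h"

using namespace Halide;

int main() {
    Var x("x");
    Func f("f");

    // Request a polynomial optimized for the blended MULPE/MAE objective with at least
    // 4 terms. The brace order {objective, min_terms} mirrors fast_atan's default
    // argument {ApproximationPrecision::MULPE, 6} declared in IROperator.h.
    ApproximationPrecision prec = {ApproximationPrecision::MULPE_MAE, 4};

    // Same idiom as the performance test: build a float Expr and take its arctangent.
    Expr t = x / 1024.0f;
    f(x) = fast_atan(t, prec);

    Buffer<float> out = f.realize({1024});
    return 0;
}

Internally, best_atan_approximation() uses find_best_approximation() to score each table entry against this requested objective and term count, so relaxing min_terms or the absolute-error constraint trades precision for fewer polynomial terms.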