Skip to content

Commit

Permalink
Feedback from Steven.
Browse files Browse the repository at this point in the history
  • Loading branch information
mcourteaux committed Dec 3, 2024
1 parent 9408c93 commit 8efe869
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 27 deletions.
36 changes: 29 additions & 7 deletions src/ApproximationTables.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,17 @@ namespace {

using OO = ApproximationPrecision::OptimizationObjective;

// clang-format off
// Generate this table with:
// python3 src/polynomial_optimizer.py atan --order 1 2 3 4 5 6 7 8 --loss mse mae mulpe mulpe_mae --no-gui --format table
std::vector<Approximation> table_atan = {
//
// Note that the maximal errors are computed with numpy with double precision.
// The real errors are a bit larger with single-precision floats (see correctness/fast_arctan.cpp).
// Also note that ULP distances which are not units are bogus, but this is because this error
// was again measured with double precision, so the actual reconstruction had more bits of
// precision than the actual float32 target value. So in practice the MaxULP Error
// will be close to round(MaxUlpE).
const std::vector<Approximation> table_atan = {
{OO::MSE, 9.249650e-04, 7.078984e-02, 2.411e+06, {+8.56188008e-01}},
{OO::MSE, 1.026356e-05, 9.214909e-03, 3.985e+05, {+9.76213454e-01, -2.00030200e-01}},
{OO::MSE, 1.577588e-07, 1.323851e-03, 6.724e+04, {+9.95982073e-01, -2.92278128e-01, +8.30180680e-02}},
Expand Down Expand Up @@ -46,21 +54,28 @@ std::vector<Approximation> table_atan = {
{OO::MULPE_MAE, 3.053218e-14, 3.784868e-07, 4.181e+01, {+9.99997480e-01, -3.33205127e-01, +1.98309644e-01, -1.33094430e-01, +8.08643094e-02, -3.45859503e-02, +7.11261604e-03}},
{OO::MULPE_MAE, 7.018877e-16, 5.862915e-08, 6.942e+00, {+9.99999581e-01, -3.33306326e-01, +1.99542180e-01, -1.39433369e-01, +9.72462857e-02, -5.69734398e-02, +2.25639390e-02, -4.24074590e-03}},
};
// clang-format on
} // namespace

const Approximation *find_best_approximation(const std::vector<Approximation> &table, ApproximationPrecision precision) {
const Approximation *find_best_approximation(const std::vector<Approximation> &table,
ApproximationPrecision precision) {
#define DEBUG_APPROXIMATION_SEARCH 0
const Approximation *best = nullptr;
constexpr int term_cost = 20;
constexpr int extra_term_cost = 200;
double best_score = 0;
// std::printf("Looking for min_terms=%d, max_absolute_error=%f\n", precision.constraint_min_poly_terms, precision.constraint_max_absolute_error);
#if DEBUG_APPROXIMATION_SEARCH
std::printf("Looking for min_terms=%d, max_absolute_error=%f\n",
precision.constraint_min_poly_terms, precision.constraint_max_absolute_error);
#endif
for (size_t i = 0; i < table.size(); ++i) {
const Approximation &e = table[i];

double penalty = 0.0;

int obj_score = e.objective == precision.optimized_for ? 100 * term_cost : 0;
if (precision.optimized_for == ApproximationPrecision::MULPE_MAE && e.objective == ApproximationPrecision::MULPE) {
if (precision.optimized_for == ApproximationPrecision::MULPE_MAE &&
e.objective == ApproximationPrecision::MULPE) {
obj_score = 50 * term_cost; // When MULPE_MAE is not available, prefer MULPE.
}

Expand All @@ -87,19 +102,26 @@ const Approximation *find_best_approximation(const std::vector<Approximation> &t
break;
}

if (precision.constraint_max_absolute_error > 0.0 && precision.constraint_max_absolute_error < e.mae) {
if (precision.constraint_max_absolute_error > 0.0 &&
precision.constraint_max_absolute_error < e.mae) {
float error_ratio = e.mae / precision.constraint_max_absolute_error;
penalty += 20 * error_ratio * extra_term_cost; // penalty for not getting the required precision.
}

double score = obj_score + term_count_score + precision_score - penalty;
// std::printf("Score for %zu (%zu terms): %f = %d + %d + %f - penalty %f\n", i, e.coefficients.size(), score, obj_score, term_count_score, precision_score, penalty);
#if DEBUG_APPROXIMATION_SEARCH
std::printf("Score for %zu (%zu terms): %f = %d + %d + %f - penalty %f\n",
i, e.coefficients.size(), score, obj_score, term_count_score,
precision_score, penalty);
#endif
if (score > best_score || best == nullptr) {
best = &e;
best_score = score;
}
}
// std::printf("Best score: %f\n", best_score);
#if DEBUG_APPROXIMATION_SEARCH
std::printf("Best score: %f\n", best_score);
#endif
return best;
}

Expand Down
5 changes: 4 additions & 1 deletion src/ApproximationTables.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#pragma once
#ifndef HALIDE_APPROXIMATION_TABLES_H
#define HALIDE_APPROXIMATION_TABLES_H

#include <vector>

Expand All @@ -19,3 +20,5 @@ const Approximation *best_atan_approximation(Halide::ApproximationPrecision prec

} // namespace Internal
} // namespace Halide

#endif
12 changes: 1 addition & 11 deletions src/IROperator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1438,19 +1438,8 @@ Expr fast_atan_approximation(const Expr &x_full, ApproximationPrecision precisio
} else {
x = select(x_gt_1, 1.0f / x_full, x_full);
}

// Coefficients obtained using src/polynomial_optimizer.py
// Note that the maximal errors are computed with numpy with double precision.
// The real errors are a bit larger with single-precision floats (see correctness/fast_arctan.cpp).
// Also note that ULP distances which are not units are bogus, but this is because this error
// was again measured with double precision, so the actual reconstruction had more bits of precision
// than the actual float32 target value. So in practice the MaxULP Error will be close to round(MaxUlpE).

// The table is huge, so let's put clang-format off and handle the layout manually:
// clang-format off
const Internal::Approximation *approx = Internal::best_atan_approximation(precision);
const std::vector<double> &c = approx->coefficients;

Expr x2 = x * x;
Expr result = float(c.back());
for (size_t i = 1; i < c.size(); ++i) {
Expand All @@ -1463,6 +1452,7 @@ Expr fast_atan_approximation(const Expr &x_full, ApproximationPrecision precisio
}
return common_subexpression_elimination(result);
}

Expr fast_atan(const Expr &x_full, ApproximationPrecision precision) {
return fast_atan_approximation(x_full, precision, false);
}
Expand Down
13 changes: 6 additions & 7 deletions src/IROperator.h
Original file line number Diff line number Diff line change
Expand Up @@ -972,18 +972,17 @@ Expr fast_sin(const Expr &x);
Expr fast_cos(const Expr &x);
// @}

/**
* Struct that allows the user to specify several requirements for functions
/** Struct that allows the user to specify several requirements for functions
* that are approximated by polynomial expansions. These polynomials can be
* optimized for four different metrics: Mean Squared Error, Maximum Absolute Error,
* Maximum Units in Last Place (ULP) Error, or a 50%/50% blend of MAE and MULPE.
*
* Orthogonally to the optimization objective, these polynomials can vary
* in degree. Higher degree polynomials will give more precise results.
* Note that instead of specifying the degree, the number of terms is used instead.
* E.g., even symmetric functions may be implemented using only even powers, for which
* A number of terms of 4 would actually mean that terms in [1, x^2, x^4, x^6] are used,
* which is degree 6.
* E.g., even (i.e., symmetric) functions may be implemented using only even powers,
* for which a number of terms of 4 would actually mean that terms
* in [1, x^2, x^4, x^6] are used, which is degree 6.
*
* Additionally, if you don't care about number of terms in the polynomial
* and you do care about the maximal absolute error the approximation may have
Expand Down Expand Up @@ -1014,8 +1013,8 @@ struct ApproximationPrecision {
* For more info on the available approximations and their precisions, see the table in ApproximationTables.cpp.
*
* Note: the polynomial uses odd powers, so the number of terms is not the degree of the polynomial.
* Note: Poly8 is only useful to increase precision for atan, and not for atan2.
* Note: The performance of this functions seem to be not reliably faster on WebGPU (for now, August 2024).
* Note: the polynomial with 8 terms is only useful to increase precision for fast_atan, and not for fast_atan2.
* Note: the performance of this functions seem to be not reliably faster on WebGPU (for now, August 2024).
*/
// @{
Expr fast_atan(const Expr &x, ApproximationPrecision precision = {ApproximationPrecision::MULPE, 6});
Expand Down
2 changes: 1 addition & 1 deletion test/performance/fast_arctan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ int main(int argc, char **argv) {

Expr t0 = x / float(test_w);
Expr t1 = y / float(test_h);
// To make sure we time mostely the computation of the arctan, and not memory bandwidth,
// To make sure we time mostly the computation of the arctan, and not memory bandwidth,
// we will compute many arctans per output and sum them. In my testing, GPUs suffer more
// from bandwith with this test, so we give it more arctangents to compute per output.
const int test_d = target.has_gpu_feature() ? 1024 : 64;
Expand Down

0 comments on commit 8efe869

Please sign in to comment.