Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Calculate the five categories for each state #35

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions cfa_rt_postprocessing/main_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,16 @@ def merge_and_render_anomaly(
except Exception as e:
console.log(f"Failed to upload the flu anomaly report: {e}")

# === Calculate the categories for the samples =====================================
console.status("Calculating the categories from the samples")
p_growing = calculate_categories(final_samples)

# Save it to file as parquet, and as CSV
p_growing_pq_file = internal_review / "p_growing.parquet"
p_growing_csv_file = internal_review / "p_growing.csv"
p_growing.write_parquet(p_growing_pq_file)
p_growing.write_csv(p_growing_csv_file)

# === Clean up =====================================================================
conn.close()
console.log(f"Cleaning up {root} folder")
Expand Down Expand Up @@ -437,6 +447,55 @@ def render_report(
)


def calculate_categories(samples_file: Path) -> pl.DataFrame:
    """
    Calculate the probability of growth and its five-level category from Rt samples.

    Takes in the path to the samples parquet file, keeps only the rows whose
    `_variable` is 'Rt', and for each geo_value, disease, and reference_date
    calculates the fraction of samples with Rt > 1. That fraction is then binned
    into one of five categories:

        (0.90, 1.00] -> 'Growing'
        (0.75, 0.90] -> 'Likely Growing'
        (0.25, 0.75] -> 'Not Changing'
        (0.10, 0.25] -> 'Likely Declining'
        [0.00, 0.10] -> 'Declining'

    The samples file is fairly large, so use duckdb to help things go faster and more
    efficiently.

    Parameters
    ----------
    samples_file
        Path to the samples file. It is resolved to an absolute path before being
        handed to duckdb.

    Returns
    -------
    A DataFrame with the columns:
        - geo_value
        - disease
        - reference_date
        - p_growing
        - five_cat_p_growing
    """
    # The path is embedded in a single-quoted SQL string literal, so double any
    # single quotes to keep a path containing an apostrophe from ending it early.
    samples_path = str(samples_file.absolute()).replace("'", "''")

    conn = duckdb.connect()
    try:
        conn.sql(
            f"""
            -- First create a 'view' of the samples. Don't use a table, because that
            -- would read it all into RAM. ATM, a single samples file is about 1.7GB,
            -- but attempting to load it into RAM crashes a 32GB machine.
            CREATE OR REPLACE VIEW samples AS
            SELECT reference_date, geo_value, disease, "value" as Rt
            FROM '{samples_path}'
            WHERE "_variable" = 'Rt';

            -- Calculate the p_growing for each geo_value, disease, and reference_date.
            -- The CASE expression reuses the p_growing alias defined just above it.
            CREATE OR REPLACE TABLE p_growing AS SELECT
                geo_value, disease, reference_date,
                AVG(IF(Rt > 1, 1, 0)) AS p_growing,
                CASE
                    WHEN (p_growing > 0.9) AND (p_growing <= 1.0) THEN 'Growing'
                    WHEN (p_growing > 0.75) AND (p_growing <= 0.9) THEN 'Likely Growing'
                    WHEN (p_growing > 0.25) AND (p_growing <= 0.75) THEN 'Not Changing'
                    WHEN (p_growing > 0.10) AND (p_growing <= 0.25) THEN 'Likely Declining'
                    WHEN (p_growing >= 0.0) AND (p_growing <= 0.10) THEN 'Declining'
                END AS five_cat_p_growing
            FROM samples
            GROUP BY ALL
            ORDER BY ALL;
            """
        )

        # Materialize into polars before the connection is closed; the `finally`
        # clause runs only after this expression has been evaluated.
        return conn.sql("SELECT * FROM p_growing").pl()
    finally:
        # Always release the duckdb connection, even if one of the queries fails.
        conn.close()


if __name__ == "__main__":
# Some sample inputs for testing. Need to move something like this to an actual test
args = {
Expand Down
Loading