Source code for sid.visualize_simulation_results

import shutil
from pathlib import Path

import dask.dataframe as dd
import numpy as np
import pandas as pd
from bokeh.io import export_png
from bokeh.io import output_file
from bokeh.models import Column
from bokeh.models import Div
from bokeh.plotting import figure
from bokeh.plotting import save
from sid.colors import get_colors
from sid.statistics import calculate_r_effective
from sid.statistics import calculate_r_zero


def visualize_simulation_results(
    data,
    outdir_path,
    infection_vars,
    background_vars,
    window_length=7,
):
    """Visualize the results of one or more simulation runs.

    One sub-folder is created inside ``outdir_path`` for the general population
    and one for every background variable. Each sub-folder receives one png per
    plot and an ``overview.html`` that stacks all plots of that group.

    Args:
        data (str, pandas.DataFrame, Path, list): a single simulation result or
            a list of simulation results. Each entry is either a DataFrame or
            a path (str or Path) to a stored result; paths are read as parquet
            files.
        outdir_path (path): path to the folder where to save the results.
            Careful, all contents are removed when the function is called.
        infection_vars (str or list): infection rate(s) to plot.
        background_vars (str or list): background variable(s) by whose value
            to group the results. Have to be present in all simulation
            results.
        window_length (int): How many dates to use for the reproduction
            numbers.

    """
    colors = get_colors("categorical", 12)

    # A single variable name is accepted for both arguments. Everything
    # downstream concatenates these with other lists, so normalize to lists.
    if isinstance(infection_vars, str):
        infection_vars = [infection_vars]
    else:
        infection_vars = list(infection_vars)
    if isinstance(background_vars, str):
        background_vars = [background_vars]
    else:
        background_vars = list(background_vars)

    outdir_path = Path(outdir_path)

    datasets = [data] if isinstance(data, (str, pd.DataFrame, Path)) else data
    datasets = [
        Path(path_or_df) if isinstance(path_or_df, str) else path_or_df
        for path_or_df in datasets
    ]

    _create_folders(outdir_path, background_vars)

    rates = _create_rates_for_all_data(
        datasets,
        infection_vars,
        background_vars,
        window_length,
    )

    for bg_var in ["general"] + background_vars:
        if bg_var == "general":
            title = "Rates in the General Population"
        else:
            title = f"Rates According to {_nice_str(bg_var)}"

        rate_plots = _create_rate_plots(rates[bg_var], colors, title)
        title_element = Div(text=title, style={"font-size": "150%"})
        _export_plots_and_layout(
            title=title_element,
            plots=rate_plots,
            outdir_path=outdir_path / bg_var,
        )
def _create_folders(outdir_path, background_vars):
    """Create an empty output folder with one sub-folder per plot group.

    An existing ``outdir_path`` is deleted with all its contents first.

    Args:
        outdir_path (pathlib.Path): folder that will hold the results.
        background_vars (list): names of the background variables. One
            sub-folder is created for each of them, plus one called
            "general".

    """
    if outdir_path.exists():
        shutil.rmtree(outdir_path)

    # parents=True so that a nested output path does not fail just because an
    # intermediate folder has not been created yet.
    outdir_path.mkdir(parents=True)

    for var in ["general"] + list(background_vars):
        outdir_path.joinpath(var).mkdir()
def _create_rates_for_all_data(
    datasets, infection_vars, background_vars, window_length
):
    """Create the statistics for each dataset and merge them into one dataset.

    Args:
        datasets (list): list of pandas.DataFrames or pathlib.Paths to stored
            simulation results.
        infection_vars (list): list of infection rates to plot
        background_vars (list): list of background variables by whose value to
            group the results. Have to be present in all simulation results.
        window_length (int): How many dates to use for the reproduction
            numbers.

    Returns:
        rates (pandas.DataFrame): DataFrame with the dates as index. The
            columns are a MultiIndex with four levels: The outermost is the
            "bg_var" ("general" for the overall rate). The next is the "rate"
            (e.g. the infectious rate or r zero), then "bg_value", the value
            of the background variable and last "data_id".

    """
    # The set of columns to load does not depend on the dataset, so it is
    # built once instead of once per loop iteration.
    vars_for_r_zero = ["immunity", "n_has_infected", "cd_infectious_false"]
    keep_vars = sorted(
        set(infection_vars + background_vars + vars_for_r_zero + ["date"])
    )

    # NOTE(review): datasets that resolve to the same name (e.g. two paths
    # with an identical file stem) overwrite each other in this dict.
    name_to_statistics = {}
    for i, df_or_path in enumerate(datasets):
        df_name, df = _load_data(df_or_path, keep_vars, i)
        name_to_statistics[df_name] = _create_statistics(
            df=df,
            infection_vars=infection_vars,
            background_vars=background_vars,
            window_length=window_length,
        )

    # The dict keys become the outermost column level "data_id"; it is moved
    # to the innermost position afterwards.
    rates = pd.concat(name_to_statistics, axis=1, names=["data_id"])
    order = ["bg_var", "rate", "bg_value", "data_id"]
    rates = rates.reorder_levels(order=order, axis=1)
    return rates
def _load_data(df_or_path, keep_vars, i):
    """Return a name for one simulation result and the columns needed from it.

    Args:
        df_or_path (pandas.DataFrame, pathlib.Path or str): the simulation
            result itself or the path to a parquet file that holds it.
        keep_vars (list): names of the columns to keep.
        i (int): position of the dataset. Used as name if a DataFrame is
            passed.

    Returns:
        df_name (int or str): ``i`` for a DataFrame, otherwise the stem of the
            file name.
        df (pandas.DataFrame): the data restricted to ``keep_vars``.

    Raises:
        NotImplementedError: if ``df_or_path`` has an unsupported type.

    """
    if isinstance(df_or_path, pd.DataFrame):
        return i, df_or_path[keep_vars]

    # Plain strings are treated as paths so that this helper accepts the same
    # inputs as the public entry point.
    if isinstance(df_or_path, str):
        df_or_path = Path(df_or_path)

    if isinstance(df_or_path, Path):
        df = dd.read_parquet(df_or_path, engine="fastparquet")[keep_vars].compute()
        return df_or_path.stem, df

    raise NotImplementedError(
        f"Cannot load simulation results from an object of type "
        f"{type(df_or_path).__name__}. Pass a pandas.DataFrame, a pathlib.Path "
        f"or a str."
    )
def _create_statistics(df, infection_vars, background_vars, window_length):
    """Calculate the infection rates and reproduction numbers for each date.

    Args:
        df (pandas.DataFrame): The simulation results.
        infection_vars (list): list of infection rates to plot
        background_vars (list): list of background variables by whose value to
            group the results. Have to be present in all simulation results.
        window_length (int): How many dates to use for the reproduction
            numbers.

    Returns:
        rates (pandas.DataFrame): DataFrame with the statistics of one
            simulation run. The index are the dates. The columns are a
            MultiIndex with three levels: The outermost is the "bg_var"
            ("general" for the overall rate). The next is the "bg_value", the
            last is the "rate" (e.g. the infectious rate or r zero). Missing
            values are filled with 0.

    """
    gb = df.groupby("date")

    # Select the rate columns before aggregating. Averaging every column first
    # is wasted work and fails on non-numeric background variables in pandas
    # versions where ``numeric_only`` defaults to False.
    overall = gb[infection_vars].mean()
    overall["r_zero"] = gb.apply(calculate_r_zero, window_length)
    overall["r_effective"] = gb.apply(calculate_r_effective, window_length)

    # add column levels for later
    overall.columns.name = "rate"
    overall = _prepend_column_level(overall, "general", "bg_value")
    overall = _prepend_column_level(overall, "general", "bg_var")

    single_df_rates = [overall]

    for bg_var in background_vars:
        gb = df.groupby([bg_var, "date"])

        # unstack(level=0) moves the values of the background variable from
        # the row index into the columns, leaving the dates as the only index.
        infection_rates = gb[infection_vars].mean().unstack(level=0)

        r_zeros = gb.apply(calculate_r_zero, window_length).unstack(level=0)
        r_zeros = _prepend_column_level(r_zeros, "r_zero", "rate")

        r_eff = gb.apply(calculate_r_effective, window_length).unstack(level=0)
        r_eff = _prepend_column_level(r_eff, "r_effective", "rate")

        rates_by_group = pd.concat([infection_rates, r_zeros, r_eff], axis=1)
        rates_by_group.columns.names = ["rate", "bg_value"]
        rates_by_group = _prepend_column_level(rates_by_group, bg_var, "bg_var")
        # Bring the levels into the same order as in the overall rates:
        # bg_var, bg_value, rate.
        rates_by_group = rates_by_group.swaplevel("rate", "bg_value", axis=1)
        single_df_rates.append(rates_by_group)

    rates = pd.concat(single_df_rates, axis=1).fillna(0)
    return rates
def _prepend_column_level(df, key, name):
    """Add a new outermost column level to a DataFrame.

    Args:
        df (pandas.DataFrame): the DataFrame whose columns are extended.
        key: the single value that the new level takes for every column.
        name (str): name of the new column level.

    Returns:
        pandas.DataFrame: copy of ``df`` whose columns carry the additional
            outermost level.

    """
    # A mapping passed to concat uses its keys as the values of the new level.
    return pd.concat({key: df}, names=[name], axis=1)
def _create_rate_plots(rates, colors, title):
    """Plot all rates for a single background variable.

    Args:
        rates (pandas.DataFrame): DataFrame with the dates as index. The
            columns are a MultiIndex with three levels: The outermost is the
            variable name (e.g. infectious or r_zero). The next are the values
            the background variable can take, the last "data_id".
        colors (list): non-empty list of colors to use. One color is used per
            variable; the colors are reused from the start if there are more
            variables than colors.
        title (str): the plot title will be the name of the rate plus this
            string.

    Returns:
        plots (list): list of bokeh plots.

    """
    # A MultiIndex obtained by slicing can still list level values that no
    # longer occur in any column. Drop them so that only existing variables
    # and group values are looked up below.
    vars_to_plot = rates.columns.remove_unused_levels().levels[0]

    # Shares that are plotted on a fixed y axis from 0 to 1.
    full_range_vars = {
        "ever_infected",
        "immunity",
        "symptomatic_among_infectious",
    }

    plots = []
    for i, var in enumerate(vars_to_plot):
        # Cycle through the palette instead of silently skipping every
        # variable beyond its length.
        color = colors[i % len(colors)]
        y_range = (0, 1) if var in full_range_vars else None

        var_rates = rates[var]
        bg_values = var_rates.columns.remove_unused_levels().levels[0]
        for bg_val in bg_values:
            plot_title = f"{_nice_str(var)} {title}"
            if bg_val != "general":
                plot_title += f": {bg_val}"

            p = _plot_rates(
                rates=var_rates[bg_val],
                title=plot_title,
                color=color,
                y_range=y_range,
            )

            # The name is used as file name later on. str() keeps this working
            # for group values that are not strings (e.g. numbers or booleans).
            if bg_val == "general":
                p.name = var
            else:
                p.name = f"{var}_{str(bg_val).replace(' ', '')}"
            plots.append(p)

    return plots
def _plot_rates(rates, title, color, y_range):
    """Draw the median, a 5%-95% band and every single trajectory over time.

    Args:
        rates (DataFrame): the index are the x values, the values the y
            values. Every column is plotted as a separate line.
        title (str): plot title.
        color (str): color used for all glyphs of the plot.
        y_range (tuple or None): range of the y axis.

    Returns:
        p (bokeh figure)

    """
    dates = rates.index
    fig = figure(
        tools=[],
        plot_height=400,
        plot_width=800,
        title=title,
        y_range=y_range,
        x_axis_type="datetime",
    )

    # thick line: median across all columns
    median = rates.median(axis=1)
    fig.line(x=dates, y=median, alpha=1, line_width=2.75, line_color=color)

    # shaded area between the 5th and 95th percentile, ignoring missing values
    lower = rates.apply(np.nanpercentile, q=5, axis=1)
    upper = rates.apply(np.nanpercentile, q=95, axis=1)
    fig.varea(x=dates, y1=upper, y2=lower, alpha=0.2, color=color)

    # thin, transparent line for each individual column
    for column in rates.columns:
        fig.line(
            x=dates,
            y=rates[column],
            line_width=1,
            line_color=color,
            alpha=0.3,
        )

    return _style(fig)
def _export_plots_and_layout(title, plots, outdir_path):
    """Write every plot to a png file and all plots together to one html file.

    Args:
        title (bokeh.Div): title element placed on top of the html overview.
        plots (list): list of bokeh plots. The name of each plot is used as
            the name of its png file.
        outdir_path (pathlib.Path): folder in which the png files and
            "overview.html" are stored.

    """
    for plot in plots:
        png_path = outdir_path / f"{plot.name}.png"
        output_file(png_path)
        export_png(plot, filename=png_path)

    # save() is called without a file name, so it writes to the file that was
    # registered last via output_file.
    html_path = outdir_path / "overview.html"
    output_file(html_path)
    layout = Column(title, *plots)
    save(layout)
def _style(p):
    """Apply a minimal look to a plot: no outline, no grid, gray axes.

    Args:
        p (bokeh figure): the plot to style. It is modified in place.

    Returns:
        p (bokeh figure): the same plot object.

    """
    gray = "#808080"

    p.outline_line_color = None

    # hide both grids
    for grid in (p.xgrid, p.ygrid):
        grid.visible = False

    # no minor ticks; everything else that belongs to the axes is gray
    p.axis.minor_tick_line_color = None
    gray_axis_properties = (
        "axis_line_color",
        "major_label_text_color",
        "major_tick_line_color",
    )
    for prop in gray_axis_properties:
        setattr(p.axis, prop, gray)

    return p
def _nice_str(s):
    """Turn a name with underscores into a title-cased label.

    Args:
        s (str): name in which words are separated by underscores.

    Returns:
        str: ``s`` with every underscore replaced by a space and every word
            capitalized.

    """
    words = s.split("_")
    return " ".join(words).title()