import numpy as np
import pandas as pd
import re
from .statistics import li_ma_significance
# Default names of the columns containing theta in degree for off regions 1-5.
default_theta_off_keys = tuple('Theta_Off_{}_deg'.format(i) for i in range(1, 6))

# Default names of the columns containing the classifier prediction
# for off regions 1-5.
default_prediction_off_keys = tuple(
    'background_prediction_{}'.format(i) for i in range(1, 6)
)

# Matches `<name>_Off_<N>` at the beginning of a column name.
# Group 1 is the name of the corresponding "on" column, group 2 the number
# of the off region. Anything following the region number (e.g. `_deg`)
# is not part of the match.
#
# The name may consist of letters, digits and underscores. The former class
# `[a-zA-z1-9]` only allowed underscores by accident (`A-z` spans the ASCII
# punctuation between `Z` and `a`) and rejected the digit `0`; the region
# number was limited to a single digit, so `_Off_10` was read as region 1.
off_key_re = re.compile(r'([a-zA-Z0-9_]+)_Off_([0-9]+)')
def calc_run_summary_source_independent(
    events, runs,
    prediction_threshold,
    theta2_cut,
    prediction_key='signal_prediction',
    theta_key='Theta_deg',
    theta_off_keys=default_theta_off_keys,
):
    '''
    Calculate run summaries for the given theta^2 and signal prediction cuts.

    This function requires that no source dependent features,
    like Theta, were used in the classification.

    Parameters
    ----------
    events: pd.DataFrame
        DataFrame with event data, needs to contain the columns
        `'night'`, `'run_id'`, `prediction_key`, `theta_key`
        and the `theta_off_keys`
    runs: pd.DataFrame
        DataFrame with run information, needs to contain the columns
        `'night'`, `'run_id'` and `'ontime'`. The passed object is not
        modified.
    prediction_threshold: float
        Threshold for the classifier prediction
    theta2_cut: float
        Selection cut for theta^2 in deg^2
    prediction_key: str
        Key to the classifier prediction
    theta_key: str
        Column name of the column containing theta in degree
    theta_off_keys: list[str]
        Column names of the column containing theta in degree
        for all off regions

    Returns
    -------
    runs: pd.DataFrame
        New DataFrame built from `runs`, sorted by `'night'` and `'run_id'`,
        with the added columns `'n_on'`, `'n_off'`, `'n_excess'`,
        `'n_excess_err'`, `'excess_rate_per_h'`, `'excess_rate_per_h_err'`
        and `'significance'`.
    '''
    run_keys = ['night', 'run_id']
    runs = runs.set_index(run_keys).sort_index()

    # apply prediction threshold cut
    selected = events.query(
        '{} >= {}'.format(prediction_key, prediction_threshold)
    )

    on_data, off_data = split_on_off_source_independent(
        selected, theta2_cut, theta_key, theta_off_keys
    )

    # every off region counts as one on region
    alpha = 1 / len(theta_off_keys)

    # Assignment aligns on the (night, run_id) index; runs without any
    # selected event end up as NaN and are set to 0. The result of fillna is
    # assigned back instead of using `inplace=True` on the column selection,
    # which does not reach the frame under pandas copy-on-write.
    runs['n_on'] = on_data.groupby(run_keys).size()
    runs['n_on'] = runs['n_on'].fillna(0)

    runs['n_off'] = off_data.groupby(run_keys).size()
    runs['n_off'] = runs['n_off'].fillna(0)

    runs['n_excess'] = runs['n_on'] - alpha * runs['n_off']
    runs['n_excess_err'] = np.sqrt(runs['n_on'] + alpha**2 * runs['n_off'])

    # NOTE(review): assumes `ontime` is given in seconds -- confirm.
    # counts / seconds * 3600 = counts per hour; the previous
    # `/ ontime / 3600` divided by the conversion factor instead.
    runs['excess_rate_per_h'] = runs['n_excess'] / runs['ontime'] * 3600
    runs['excess_rate_per_h_err'] = runs['n_excess_err'] / runs['ontime'] * 3600

    runs['significance'] = li_ma_significance(
        runs['n_on'], runs['n_off'], alpha
    )

    runs.reset_index(inplace=True)

    return runs
def split_on_off_source_independent(
    events,
    theta2_cut,
    theta_key='Theta_deg',
    theta_off_keys=default_theta_off_keys,
):
    '''
    Split events dataframe into on and off region

    Parameters
    ----------
    events: pd.DataFrame
        DataFrame containing event information, required are
        `theta_key` and `theta_off_keys`.
    theta2_cut: float
        Selection cut for theta^2 in deg^2
    theta_key: str
        Column name of the column containing theta in degree
    theta_off_keys: list[str]
        Column names of the column containing theta in degree
        for all off regions

    Returns
    -------
    on_data: pd.DataFrame
        Copy of the events with `theta_key` <= sqrt(`theta2_cut`)
    off_data: pd.DataFrame
        Concatenation of the events passing the same cut in each of the
        `theta_off_keys`. Each part gets an `'off_region'` column holding
        the 1-based position of its key in `theta_off_keys` and is passed
        through `drop_off_columns` for that region.
    '''
    # the columns hold theta, the cut is given in theta^2
    theta_cut = np.sqrt(theta2_cut)

    # Copies decouple the selections from `events`, so they can be modified
    # without triggering pandas' SettingWithCopyWarning.
    on_data = events.query('{} <= {}'.format(theta_key, theta_cut)).copy()

    off_dfs = []
    for region, theta_off_key in enumerate(theta_off_keys, start=1):
        off_df = events.query(
            '{} <= {}'.format(theta_off_key, theta_cut)
        ).copy()

        off_df['off_region'] = region
        drop_off_columns(off_df, region, inplace=True)

        off_dfs.append(off_df)

    off_data = pd.concat(off_dfs)

    return on_data, off_data
def calc_run_summary_source_dependent(
    events, runs,
    prediction_threshold,
    on_prediction_key='signal_prediction',
    off_prediction_keys=default_prediction_off_keys,
):
    '''
    Calculate run summaries for the given signal prediction cuts.

    This function needs to be used, if source dependent features like
    Theta were used for the classification.

    Parameters
    ----------
    events: pd.DataFrame
        DataFrame with event data, needs to contain the columns
        `'night'`, `'run_id'`, `on_prediction_key`
        and the `off_prediction_keys`
    runs: pd.DataFrame
        DataFrame with run information, needs to contain the columns
        `'night'`, `'run_id'` and `'ontime'`. The passed object is not
        modified.
    prediction_threshold: float
        Threshold for the signalness prediction
    on_prediction_key: str
        Key to the classifier prediction for the on region
    off_prediction_keys: list[str]
        Iterable of keys to the classifier predictions for the off regions

    Returns
    -------
    runs: pd.DataFrame
        New DataFrame built from `runs`, sorted by `'night'` and `'run_id'`,
        with the added columns `'n_on'`, `'n_off'`, `'significance'`,
        `'n_excess'`, `'n_excess_err'`, `'excess_rate_per_h'` and
        `'excess_rate_per_h_err'`.
    '''
    run_keys = ['night', 'run_id']
    runs = runs.set_index(run_keys).sort_index()

    on_data, off_data = split_on_off_source_dependent(
        events, prediction_threshold, on_prediction_key, off_prediction_keys
    )

    # every off region counts as one on region
    alpha = 1 / len(off_prediction_keys)

    # Assignment aligns on the (night, run_id) index; runs without any
    # selected event end up as NaN and are set to 0. The result of fillna is
    # assigned back instead of using `inplace=True` on the column selection,
    # which does not reach the frame under pandas copy-on-write.
    runs['n_on'] = on_data.groupby(run_keys).size()
    runs['n_on'] = runs['n_on'].fillna(0)

    runs['n_off'] = off_data.groupby(run_keys).size()
    runs['n_off'] = runs['n_off'].fillna(0)

    runs['significance'] = li_ma_significance(
        runs['n_on'], runs['n_off'], alpha
    )

    runs['n_excess'] = runs['n_on'] - alpha * runs['n_off']
    runs['n_excess_err'] = np.sqrt(runs['n_on'] + alpha**2 * runs['n_off'])

    # NOTE(review): assumes `ontime` is given in seconds -- confirm.
    # counts / seconds * 3600 = counts per hour; the previous
    # `/ ontime / 3600` divided by the conversion factor instead.
    runs['excess_rate_per_h'] = runs['n_excess'] / runs['ontime'] * 3600
    runs['excess_rate_per_h_err'] = runs['n_excess_err'] / runs['ontime'] * 3600

    runs.reset_index(inplace=True)

    return runs
def split_on_off_source_dependent(
    events,
    prediction_threshold,
    on_prediction_key='signal_prediction',
    off_prediction_keys=default_prediction_off_keys,
):
    '''
    Divide the events into an on sample and a combined off sample
    using a cut on the classifier predictions.

    In every off sample the off prediction takes the place of the on
    prediction: it is stored under `on_prediction_key` and the off key
    itself is removed. The remaining per-region columns are handled by
    `drop_off_columns`.

    Parameters
    ----------
    events: pd.DataFrame
        DataFrame containing event information, required are
        `on_prediction_key` and the `off_prediction_keys`.
    prediction_threshold: float
        Threshold for the signalness prediction
    on_prediction_key: str
        Key to the classifier prediction for the on region
    off_prediction_keys: list[str]
        Iterable of keys to the classifier predictions for the off regions

    Returns
    -------
    on_data: pd.DataFrame
        Copy of the events with `on_prediction_key` >= `prediction_threshold`
    off_data: pd.DataFrame
        Concatenation of the selections for all off regions, with an
        additional `'off_region'` column holding the 1-based position
        of the key in `off_prediction_keys`.
    '''
    cut_template = '{} >= {}'

    def select(key):
        # independent copy of all events passing the cut in column `key`
        return events.query(
            cut_template.format(key, prediction_threshold)
        ).copy()

    on_data = select(on_prediction_key)

    region_frames = []
    for region, off_key in enumerate(off_prediction_keys, start=1):
        frame = select(off_key)
        frame['off_region'] = region

        # move the off prediction into the place of the on prediction
        frame.drop(on_prediction_key, axis=1, inplace=True)
        frame[on_prediction_key] = frame.pop(off_key)

        drop_off_columns(frame, region, inplace=True)
        region_frames.append(frame)

    return on_data, pd.concat(region_frames)
def drop_off_columns(df, off_region, inplace=False):
    '''
    Replace the "On" column with the column
    of the respective off region.

    For example for `off_region = 1`, `Theta` is replaced by
    Theta_Off_1 and all Theta_Off_<N> columns are dropped.
    Same for all other columns, containing the pattern `_Off_<N>`.

    Text following the region number is kept as part of the on column name,
    e.g. for `off_region = 1`, `Theta_deg` is replaced by `Theta_Off_1_deg`.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame containing the on and off columns
    off_region: int
        Number of the off region whose columns replace the on columns
    inplace: bool
        If true, modify `df` itself and return None,
        otherwise work on a copy and return it.

    Returns
    -------
    df: pd.DataFrame or None
        The modified copy, or None if `inplace` is true

    Raises
    ------
    KeyError
        If an on column that should be replaced does not exist
    '''
    if not inplace:
        df = df.copy()

    # iterate over a snapshot, columns are added and removed inside the loop
    for col in list(df.columns):
        # only string labels can follow the `_Off_<N>` naming scheme
        if not isinstance(col, str):
            continue

        m = off_key_re.match(col)
        if m is None:
            continue

        on_key, key_region = m.group(1), m.group(2)

        if int(key_region) == off_region:
            # The pattern ends at the region number; whatever follows
            # (e.g. `_deg`) also belongs to the name of the on column.
            on_key += col[m.end():]

            # drop and re-add, so the replaced column moves to the end
            df.drop(on_key, axis=1, inplace=True)
            df[on_key] = df[col]

        # off columns are removed, no matter which region they belong to
        df.drop(col, axis=1, inplace=True)

    if not inplace:
        return df