'''Core analysis functions for fact.analysis: on/off event splitting and per-run summaries.'''

import numpy as np
import pandas as pd
import re

from .statistics import li_ma_significance

# Default column names for the five standard off regions:
# theta angles for the source-independent analysis and classifier
# predictions for the source-dependent analysis.
default_theta_off_keys = tuple(
    'Theta_Off_{}_deg'.format(region) for region in range(1, 6)
)
default_prediction_off_keys = tuple(
    'background_prediction_{}'.format(region) for region in range(1, 6)
)


# Matches off-region column names like 'Theta_Off_3': group 1 is the base
# ("on") column name, group 2 is the off-region number.
# Fixed character-class typo 'a-zA-z' -> 'a-zA-Z': the lowercase 'z' upper
# bound accidentally also matched '[', '\', ']', '^', '_' and '`'
# (the codepoints between 'Z' and 'a').
off_key_re = re.compile('([a-zA-Z1-9]+)_Off_([0-9])')


def calc_run_summary_source_independent(
    events,
    runs,
    prediction_threshold,
    theta2_cut,
    prediction_key='signal_prediction',
    theta_key='Theta_deg',
    theta_off_keys=default_theta_off_keys,
):
    '''
    Calculate run summaries for the given theta^2 and signal prediction cuts.

    This function requires that no source dependent features, like Theta,
    were used in the classification.

    Parameters
    ----------
    events: pd.DataFrame
        DataFrame with event data, needs to contain the columns
        `'night'`, `'run_id'`, `theta_key` and the `theta_off_keys`
    runs: pd.DataFrame
        DataFrame with run data, needs to contain the columns
        `'night'`, `'run_id'` and `'ontime'`
    prediction_threshold: float
        Threshold for the classifier prediction
    theta2_cut: float
        Selection cut for theta^2 in deg^2
    prediction_key: str
        Key to the classifier prediction
    theta_key: str
        Column name of the column containing theta in degree
    theta_off_keys: list[str]
        Column names of the columns containing theta in degree
        for all off regions

    Returns
    -------
    pd.DataFrame
        Copy of `runs` with per-run counts (`n_on`, `n_off`), excess,
        excess rate and Li&Ma significance columns added.
    '''
    # set_index returns a new frame, so the caller's `runs` is not mutated
    runs = runs.set_index(['night', 'run_id'])
    runs.sort_index(inplace=True)

    # apply prediction threshold cut
    selected = events.query(
        '{} >= {}'.format(prediction_key, prediction_threshold)
    )

    on_data, off_data = split_on_off_source_independent(
        selected, theta2_cut, theta_key, theta_off_keys
    )

    # exposure ratio: one on region vs. len(theta_off_keys) off regions
    alpha = 1 / len(theta_off_keys)

    # Runs without any selected events get NaN from aligning the groupby
    # result; treat those as zero counts. Assign the filled series instead
    # of chained `inplace=True` fillna, which is deprecated and ineffective
    # under pandas copy-on-write.
    runs['n_on'] = on_data.groupby(['night', 'run_id']).size()
    runs['n_on'] = runs['n_on'].fillna(0)

    runs['n_off'] = off_data.groupby(['night', 'run_id']).size()
    runs['n_off'] = runs['n_off'].fillna(0)

    runs['n_excess'] = runs['n_on'] - alpha * runs['n_off']
    runs['n_excess_err'] = np.sqrt(runs['n_on'] + alpha**2 * runs['n_off'])

    # NOTE(review): dividing by 3600 yields a per-hour rate only if
    # `ontime` is stored in units of 1/3600 h. If `ontime` is in seconds,
    # the conversion should be `* 3600` instead — confirm the unit upstream.
    runs['excess_rate_per_h'] = runs['n_excess'] / runs['ontime'] / 3600
    runs['excess_rate_per_h_err'] = (
        runs['n_excess_err'] / runs['ontime'] / 3600
    )

    runs['significance'] = li_ma_significance(
        runs['n_on'], runs['n_off'], alpha
    )

    runs.reset_index(inplace=True)

    return runs
def split_on_off_source_independent(
    events,
    theta2_cut,
    theta_key='Theta_deg',
    theta_off_keys=default_theta_off_keys,
):
    '''
    Split events dataframe into on and off region.

    Parameters
    ----------
    events: pd.DataFrame
        DataFrame containing event information, required are
        `theta_key` and `theta_off_keys`.
    theta2_cut: float
        Selection cut for theta^2 in deg^2
    theta_key: str
        Column name of the column containing theta in degree
    theta_off_keys: list[str]
        Column names of the columns containing theta in degree
        for all off regions

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        The on events and the concatenated off events; the off frame
        gains an `'off_region'` column and has its `_Off_<N>` columns
        folded into the on columns via `drop_off_columns`.
    '''
    # the theta columns are in degree, the cut is given on theta^2 in deg^2
    theta_cut = np.sqrt(theta2_cut)

    on_data = events.query('{} <= {}'.format(theta_key, theta_cut))

    off_dfs = []
    for region, theta_off_key in enumerate(theta_off_keys, start=1):
        # copy the query result before adding columns; assigning into the
        # un-copied result raises SettingWithCopyWarning and can be a
        # silent no-op (the source-dependent variant already copies).
        off_df = events.query(
            '{} <= {}'.format(theta_off_key, theta_cut)
        ).copy()
        off_df['off_region'] = region
        drop_off_columns(off_df, region, inplace=True)
        off_dfs.append(off_df)

    off_data = pd.concat(off_dfs)

    return on_data, off_data
def calc_run_summary_source_dependent(
    events,
    runs,
    prediction_threshold,
    on_prediction_key='signal_prediction',
    off_prediction_keys=default_prediction_off_keys,
):
    '''
    Calculate run summaries for the given signal prediction cuts.

    This function needs to be used if source dependent features like
    Theta were used for the classification.

    Parameters
    ----------
    events: pd.DataFrame
        DataFrame with event data, needs to contain the columns
        `'night'`, `'run_id'`, `on_prediction_key` and the
        `off_prediction_keys`
    runs: pd.DataFrame
        DataFrame with run data, needs to contain the columns
        `'night'`, `'run_id'` and `'ontime'`
    prediction_threshold: float
        Threshold for the signalness prediction
    on_prediction_key: str
        Key to the classifier prediction for the on region
    off_prediction_keys: list[str]
        Iterable of keys to the classifier predictions for the off regions

    Returns
    -------
    pd.DataFrame
        Copy of `runs` with per-run counts (`n_on`, `n_off`), excess,
        excess rate and Li&Ma significance columns added.
    '''
    # set_index returns a new frame, so the caller's `runs` is not mutated
    runs = runs.set_index(['night', 'run_id'])
    runs.sort_index(inplace=True)

    on_data, off_data = split_on_off_source_dependent(
        events, prediction_threshold, on_prediction_key, off_prediction_keys
    )

    # exposure ratio: one on region vs. len(off_prediction_keys) off regions
    alpha = 1 / len(off_prediction_keys)

    # Runs without any selected events get NaN from aligning the groupby
    # result; treat those as zero counts. Assign the filled series instead
    # of chained `inplace=True` fillna, which is deprecated and ineffective
    # under pandas copy-on-write.
    runs['n_on'] = on_data.groupby(['night', 'run_id']).size()
    runs['n_on'] = runs['n_on'].fillna(0)

    runs['n_off'] = off_data.groupby(['night', 'run_id']).size()
    runs['n_off'] = runs['n_off'].fillna(0)

    runs['significance'] = li_ma_significance(
        runs['n_on'], runs['n_off'], alpha
    )

    runs['n_excess'] = runs['n_on'] - alpha * runs['n_off']
    runs['n_excess_err'] = np.sqrt(runs['n_on'] + alpha**2 * runs['n_off'])

    # NOTE(review): dividing by 3600 yields a per-hour rate only if
    # `ontime` is stored in units of 1/3600 h. If `ontime` is in seconds,
    # the conversion should be `* 3600` instead — confirm the unit upstream.
    runs['excess_rate_per_h'] = runs['n_excess'] / runs['ontime'] / 3600
    runs['excess_rate_per_h_err'] = (
        runs['n_excess_err'] / runs['ontime'] / 3600
    )

    runs.reset_index(inplace=True)

    return runs
def split_on_off_source_dependent(
    events,
    prediction_threshold,
    on_prediction_key='signal_prediction',
    off_prediction_keys=default_prediction_off_keys,
):
    '''
    Split events dataframe into on and off region.

    For the off regions, keys are renamed to their "on" equivalents
    and the "off" keys are dropped.

    Parameters
    ----------
    events: pd.DataFrame
        DataFrame containing event information, required are
        `on_prediction_key` and `off_prediction_keys`.
    prediction_threshold: float
        Threshold for the signalness prediction
    on_prediction_key: str
        Key to the classifier prediction for the on region
    off_prediction_keys: list[str]
        Iterable of keys to the classifier predictions for the off regions
    '''
    on_query = '{} >= {}'.format(on_prediction_key, prediction_threshold)
    on_data = events.query(on_query).copy()

    off_dfs = []
    for region, off_key in enumerate(off_prediction_keys, start=1):
        off_query = '{} >= {}'.format(off_key, prediction_threshold)
        off_df = events.query(off_query).copy()

        off_df['off_region'] = region

        # move this region's off prediction into the on key,
        # then drop the now-redundant off prediction column
        off_df.drop(on_prediction_key, axis=1, inplace=True)
        off_df[on_prediction_key] = off_df[off_key]
        off_df.drop(off_key, axis=1, inplace=True)

        drop_off_columns(off_df, region, inplace=True)
        off_dfs.append(off_df)

    return on_data, pd.concat(off_dfs)
def drop_off_columns(df, off_region, inplace=False):
    '''
    Replace the "on" columns with the columns of the respective off region.

    For example for `off_region = 1`, `Theta` is replaced by `Theta_Off_1`
    and all `Theta_Off_<N>` columns are dropped. The same happens for all
    other columns matching the pattern `<name>_Off_<N>`.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame whose columns are rewritten
    off_region: int
        Number of the off region whose columns replace the on columns
    inplace: bool
        If False (default), work on a copy and return it;
        if True, modify `df` and return None.

    Returns
    -------
    pd.DataFrame or None
        The modified copy when `inplace` is False, else None.
    '''
    if inplace is False:
        df = df.copy()

    # iterate over an explicit snapshot of the column names, since
    # columns are dropped from `df` inside the loop
    for col in list(df.columns):
        m = off_key_re.match(col)
        if m:
            on_key, key_region = m.groups()
            # only the matching region's column replaces the on column
            if int(key_region) == off_region:
                df.drop(on_key, axis=1, inplace=True)
                df[on_key] = df[col]
            # every _Off_<N> column is dropped
            df.drop(col, axis=1, inplace=True)

    if inplace is False:
        return df