# Source code for fact.analysis.binning
import pandas as pd
import datetime
import numpy as np
from .statistics import li_ma_significance
def ontime_binning(runs, bin_width_minutes=20):
    '''
    Calculate bin numbers for given runs.

    A new bin is created if either a bin would have more ontime
    than `bin_width_minutes` or `run_start` of the next run is
    more than `bin_width_minutes` after `run_stop` of the last run.

    Parameters
    ----------
    runs: pd.DataFrame
        DataFrame containing analysis results and meta data
        for each run
    bin_width_minutes: number
        The desired amount of ontime in each bin.
        Note: The ontime in each bin will always be
        slightly less than `bin_width_minutes`
    '''
    max_ontime = bin_width_minutes * 60
    max_gap = datetime.timedelta(seconds=max_ontime)

    current_bin = 0
    accumulated = 0
    assigned = []
    previous_stop = runs['run_start'].iloc[0]

    for _, run in runs.iterrows():
        gap = run.run_start - previous_stop
        previous_stop = run.run_stop
        # open a new bin when the ontime would overflow it
        # or when there is a large gap between consecutive runs
        if accumulated + run.ontime > max_ontime or gap > max_gap:
            current_bin += 1
            accumulated = 0
        assigned.append(current_bin)
        accumulated += run.ontime

    return pd.Series(assigned, index=runs.index)
def qla_binning(data, bin_width_minutes=20):
    '''
    The binning algorithm as used by lightcurve.c

    Assign a bin id to each row of `data`, starting a new bin as soon
    as the accumulated `fOnTimeAfterCuts` would exceed `bin_width_minutes`.

    Parameters
    ----------
    data: pd.DataFrame
        QLA results, must contain the column `fOnTimeAfterCuts`
        (ontime in seconds)
    bin_width_minutes: number
        Maximum amount of ontime per bin, in minutes

    Returns
    -------
    pd.Series
        bin id for each row, with the index of `data`
    '''
    bin_width_sec = bin_width_minutes * 60
    bin_number = 0
    ontime_sum = 0
    bins = []
    for _, row in data.iterrows():
        if ontime_sum + row.fOnTimeAfterCuts > bin_width_sec:
            bin_number += 1
            ontime_sum = 0
        bins.append(bin_number)
        # bug fix: accumulate the same column that is tested above —
        # the original summed row['ontime'] while testing fOnTimeAfterCuts,
        # so the threshold was compared against a different quantity
        ontime_sum += row.fOnTimeAfterCuts
    return pd.Series(bins, index=data.index)
def groupby_observation_blocks(runs):
    ''' Groupby for consecutive runs of the same source'''
    ordered = runs.sort_values('run_start')
    # a new block starts wherever the source name differs from the
    # previous (chronologically ordered) run
    source_changed = ordered.fSourceName != ordered.fSourceName.shift(1)
    block_ids = source_changed.cumsum()
    return ordered.groupby(block_ids)
def nightly_binning(runs):
    '''
    Assign a bin id to each run so that all runs of the same night
    share one bin.

    Bin ids are consecutive integers in order of first appearance
    of each night in `runs`.

    Parameters
    ----------
    runs: pd.DataFrame
        must contain a `night` column

    Returns
    -------
    pd.Series
        integer bin id per run, with the index of `runs`
    '''
    # `unique` preserves order of first appearance, so bin ids match the
    # original loop-based assignment. Mapping avoids constructing an
    # uninitialized integer Series (filled with NaN), which modern
    # pandas rejects for integer dtypes, and is O(n) instead of
    # O(nights * runs).
    night_to_bin = {
        night: bin_id for bin_id, night in enumerate(runs['night'].unique())
    }
    return runs['night'].map(night_to_bin).astype(int)
def bin_runs(
    runs,
    alpha=0.2,
    binning_function=ontime_binning,
    **kwargs
):
    '''
    Bin runs using `binning_function` to assign bins to
    the individual runs.

    Calculates n_on, n_off, ontime, n_excess, excess_rate_per_h,
    excess_rate_err, li_ma_significance and bin width.

    Parameters
    ----------
    runs: pandas.DataFrame
        The analysis results and necessary metadata for each run.
        Required are: ontime, n_on, n_off, run_start, run_stop, source
    alpha: float
        The weight for the off regions, e.g. 1 / number of off regions
    binning_function: function
        A function that takes the run df and returns a
        pd.Series containing bin ids with the index of the original
        dataframe

    All `**kwargs` are passed to the binning function

    Returns
    -------
    pd.DataFrame
        one row per (source, bin) with the aggregated quantities
    '''
    runs = runs.sort_values(by='run_start')

    sources = []
    for source, df in runs.groupby('source'):
        df = df.copy()
        df['bin'] = binning_function(df, **kwargs)

        binned = df.groupby('bin').aggregate({
            'ontime': 'sum',
            'n_on': 'sum',
            'n_off': 'sum',
            'run_start': 'min',
            'run_stop': 'max',
        })
        binned['n_excess'] = binned.n_on - binned.n_off * alpha
        binned['excess_rate_per_h'] = binned.n_excess / binned.ontime * 3600
        binned['time_width'] = binned.run_stop - binned.run_start
        binned['time_mean'] = binned.run_start + 0.5 * binned.time_width
        # Gaussian error propagation of n_on - alpha * n_off,
        # converted to a rate per hour
        binned['excess_rate_err'] = np.sqrt(binned.n_on + alpha**2 * binned.n_off)
        binned['excess_rate_err'] /= binned.ontime / 3600
        # bug fix: use the `alpha` parameter instead of the hard-coded 0.2,
        # so a non-default off-region weight yields correct significances
        binned['significance'] = li_ma_significance(
            binned.n_on, binned.n_off, alpha
        )
        binned['source'] = source
        # an observation night spans noon to noon, hence the 12 h shift
        binned['night'] = (
            binned.time_mean - pd.Timedelta(hours=12)
        ).dt.strftime('%Y%m%d').astype(int)

        sources.append(binned)

    return pd.concat(sources)