Source code for pecos.metrics

"""
The metrics module contains metrics that describe the quality control 
analysis or compute quantities that might be of use in the analysis
"""
import pandas as pd
import numpy as np
import logging

logger = logging.getLogger(__name__)

def qci(mask, tfilter=None):
    r"""
    Compute the quality control index (QCI) for each column, defined as:

    :math:`QCI=\dfrac{\sum_{t\in T}X_{dt}}{|T|}`

    where
    :math:`T` is the set of timestamps in the analysis,
    :math:`X_{dt}` is a data point for column :math:`d` at time :math:`t` that
    passed all quality control tests, and
    :math:`|T|` is the number of data points in the analysis.

    Parameters
    ----------
    mask : pandas DataFrame
        Test results mask, returned from pm.mask

    tfilter : pandas Series, optional
        Time filter containing boolean values for each time index

    Returns
    -------
    pandas Series
        Quality control index
    """
    if tfilter is not None:
        mask = mask[tfilter]

    qci = mask.sum()/mask.shape[0]

    return qci
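
# Illustrative usage sketch (not part of the original module): the hourly index
# and hand-built boolean mask below are assumptions for demonstration only; in
# practice the mask comes from a pecos performance monitoring analysis (pm.mask).
def _example_qci():
    index = pd.date_range('2023-01-01', periods=4, freq='h')
    mask = pd.DataFrame({'A': [True, True, False, True]}, index=index)
    return qci(mask)  # 3 of 4 points pass all tests, so QCI for column A is 0.75
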
def rmse(data1, data2, tfilter=None):
    r"""
    Compute the root mean squared error (RMSE) for each column, defined as:

    :math:`RMSE=\sqrt{\dfrac{\sum{(data_1-data_2)^2}}{n}}`

    where
    :math:`data_1` is a time series,
    :math:`data_2` is a time series, and
    :math:`n` is the number of data points.

    Parameters
    ----------
    data1 : pandas DataFrame
        Data

    data2 : pandas DataFrame
        Data. Note, the column names in data1 must equal the column names in data2

    tfilter : pandas Series, optional
        Time filter containing boolean values for each time index

    Returns
    -------
    pandas Series
        Root mean squared error
    """
    if len(set(data1.columns).symmetric_difference(set(data2.columns))) > 0:
        logger.warning("The column names in 'data1' must equal the column names in 'data2'")
        return

    if tfilter is not None:
        data1 = data1[tfilter]
        data2 = data2[tfilter]

    rmse = {}
    for col in data1.columns:
        rmse[col] = np.sqrt(np.mean(np.power(data1[col] - data2[col], 2)))
    rmse = pd.Series(rmse)

    return rmse
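
# Illustrative usage sketch (not part of the original module): two small
# single-column DataFrames, made up for demonstration, that differ by a
# constant offset of 1.
def _example_rmse():
    data1 = pd.DataFrame({'A': [1.0, 2.0, 3.0]})
    data2 = pd.DataFrame({'A': [2.0, 3.0, 4.0]})
    return rmse(data1, data2)  # every residual is 1, so RMSE for column A is 1.0
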
def time_integral(data, tfilter=None):
    r"""
    Compute the time integral (F) for each column, defined as:

    :math:`F=\int{fdt}`

    where
    :math:`f` is a column of data and
    :math:`dt` is the time step between observations.
    The integral is computed using the trapezoidal rule from numpy.trapz.
    Results are given in [original data units]*seconds.
    NaN values are set to 0 for integration.

    Parameters
    ----------
    data : pandas DataFrame
        Data

    tfilter : pandas Series, optional
        Time filter containing boolean values for each time index

    Returns
    -------
    pandas Series
        Integral
    """
    if isinstance(data, pd.Series):
        data = data.to_frame()

    if tfilter is not None:
        data = data[tfilter]

    data = data.fillna(0)  # fill NaN with 0

    tdelta = (data.index - data.index[0]).total_seconds()  # convert to seconds

    F = {}
    for col in data.columns:
        F[col] = float(np.trapz(data.loc[:, col], tdelta))
    F = pd.Series(F)

    return F
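
# Illustrative usage sketch (not part of the original module): a constant
# signal of 1.0 sampled hourly over a made-up two-hour window.
def _example_time_integral():
    index = pd.date_range('2023-01-01', periods=3, freq='h')
    data = pd.DataFrame({'A': [1.0, 1.0, 1.0]}, index=index)
    return time_integral(data)  # 1.0 integrated over 7200 seconds gives 7200.0
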
def time_derivative(data, tfilter=None):
    r"""
    Compute the derivative (f') of each column, defined as:

    :math:`f'=\dfrac{df}{dt}`

    where
    :math:`f` is a column of data and
    :math:`dt` is the time step between observations.
    The derivative is computed using central differences from numpy.gradient.
    Results are given in [original data units]/seconds.

    Parameters
    ----------
    data : pandas DataFrame
        Data

    tfilter : pandas Series, optional
        Filter containing boolean values for each time index

    Returns
    -------
    pandas DataFrame
        Derivative of the data
    """
    if tfilter is not None:
        data = data[tfilter]

    tdelta = (data.index - data.index[0]).total_seconds()  # convert to seconds

    f = {}
    for col in data.columns:
        f[col] = np.gradient(data.loc[:, col], tdelta)
    f = pd.DataFrame(f, index=data.index)

    return f
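
# Illustrative usage sketch (not part of the original module): a signal that
# rises by 1 unit per hour on a made-up hourly index.
def _example_time_derivative():
    index = pd.date_range('2023-01-01', periods=3, freq='h')
    data = pd.DataFrame({'A': [0.0, 1.0, 2.0]}, index=index)
    return time_derivative(data)  # 1 unit/hour = 1/3600 units/second at every timestamp
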
def probability_of_detection(observed, actual, tfilter=None):
    r"""
    Compute probability of detection (PD) for each column, defined as:

    :math:`PD=\dfrac{TP}{TP+FN}`

    where
    :math:`TP` is the number of true positives and
    :math:`FN` is the number of false negatives.

    Parameters
    ----------
    observed : pandas DataFrame
        Estimated conditions (True = background, False = anomalous),
        returned from pm.mask

    actual : pandas DataFrame
        Actual conditions (True = background, False = anomalous).
        Note, the column names in observed must equal the column names in actual

    tfilter : pandas Series, optional
        Filter containing boolean values for each time index

    Returns
    -------
    pandas Series
        Probability of detection
    """
    if len(set(observed.columns).symmetric_difference(set(actual.columns))) > 0:
        logger.warning("The column names in 'observed' must equal the column names in 'actual'")
        return

    if tfilter is not None:
        observed = observed[tfilter]
        actual = actual[tfilter]

    PD = {}
    for col in observed.columns:
        obs = observed[col]
        act = actual[col]

        # True positive (TP) = anomalous condition where tests fail
        # (observed and actual are both False); the comparison is True where
        # that holds and False elsewhere, since NaN never compares equal
        TP = (obs.where(obs == False)+1) == (act.where(act == False)+1)
        TP_count = TP.sum()

        # False negative (FN) = anomalous condition where tests pass
        # (observed is True, actual is False)
        FN = (obs.where(obs == True)) == (act.where(act == False)+1)
        FN_count = FN.sum()

        PD[col] = TP_count/float(TP_count+FN_count)

    PD = pd.Series(PD)

    return PD
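
# Illustrative usage sketch (not part of the original module): hand-made
# observed/actual masks (True = background, False = anomalous) chosen so that
# the two actual anomalies split into one true positive and one false negative.
def _example_probability_of_detection():
    observed = pd.DataFrame({'A': [True, False, True, True]})
    actual = pd.DataFrame({'A': [True, False, False, True]})
    return probability_of_detection(observed, actual)  # TP=1, FN=1 -> PD = 0.5
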
def false_alarm_rate(observed, actual, tfilter=None):
    r"""
    Compute false alarm rate (FAR) for each column, defined as:

    :math:`FAR=\dfrac{FP}{FP+TN}`

    where
    :math:`FP` is the number of false positives and
    :math:`TN` is the number of true negatives.

    Parameters
    ----------
    observed : pandas DataFrame
        Estimated conditions (True = background, False = anomalous),
        returned from pm.mask

    actual : pandas DataFrame
        Actual conditions (True = background, False = anomalous).
        Note, the column names in observed must equal the column names in actual

    tfilter : pandas Series, optional
        Filter containing boolean values for each time index

    Returns
    -------
    pandas Series
        False alarm rate
    """
    if len(set(observed.columns).symmetric_difference(set(actual.columns))) > 0:
        logger.warning("The column names in 'observed' must equal the column names in 'actual'")
        return

    if tfilter is not None:
        observed = observed[tfilter]
        actual = actual[tfilter]

    FAR = {}
    for col in observed.columns:
        obs = observed[col]
        act = actual[col]

        # True negative (TN) = normal condition where tests pass
        # (observed and actual are both True); the comparison is True where
        # that holds and False elsewhere, since NaN never compares equal
        TN = (obs.where(obs == True)) == (act.where(act == True))
        TN_count = TN.sum()

        # False positive (FP) = normal condition where tests fail
        # (observed is False, actual is True)
        FP = (obs.where(obs == False)+1) == (act.where(act == True))
        FP_count = FP.sum()

        # FAR = FP/(FP+TN), computed as 1 - TN/(TN+FP)
        FAR[col] = 1 - TN_count/float(TN_count+FP_count)

    FAR = pd.Series(FAR)

    return FAR
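
# Illustrative usage sketch (not part of the original module): hand-made
# observed/actual masks (True = background, False = anomalous) with three true
# negatives and one false positive.
def _example_false_alarm_rate():
    observed = pd.DataFrame({'A': [True, True, True, False, False]})
    actual = pd.DataFrame({'A': [True, True, True, True, False]})
    return false_alarm_rate(observed, actual)  # TN=3, FP=1 -> FAR = 1 - 3/4 = 0.25
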