ROC & AUC
A metric traditionally used for binary classification task.
sklearn API
from sklearn import metrics
# one step
auc = metrics.roc_auc_score(truth, pred)
# two steps
fpr, tpr, thresholds = metrics.roc_curve(truth, pred)
auc = metrics.auc(fpr, tpr)
Great Illustration
Receiver Operation Curve
A graphic plot metric for binary classifier.
\[
\displaylines{
\begin{eqnarray}
TPR = \frac{TP}{P}=\frac {TP}{TP+FN} \\
FPR = \frac{FP}{N}=\frac {FP}{FP+TN} \\
\end{eqnarray}
}
\]
In binary classification, the prediction is usually a continuous variable.
We need a threshold \(T\) to determine whether the prediction is positive or negative.
ROC curve plots parametrically \(TPR(T)\) vs. \(FPR(T)\).
Area Under the Curve
Use \(TPR\) as y axis and \(FPR\) as x axis.
Self Implementations
# naive thresholding (lot of duplicates)
def get_auc(truth, pred):
truth = np.array(truth)
pred = np.array(pred)
def get_tpr_fpr(truth, pred, thresh):
pred = (pred >= thresh).astype(np.int)
tp = np.logical_and(truth == 1, pred == 1).sum()
tn = np.logical_and(truth == 0, pred == 0).sum()
fn = np.logical_and(truth == 1, pred == 0).sum()
fp = np.logical_and(truth == 0, pred == 1).sum()
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
return tpr, fpr
# get tprs, fprs
tprs = []
fprs = []
threshs = []
for thresh in np.arange(1, -0.01, -0.01): # descending, so we get [0,0] to [1,1]
tpr, fpr = get_tpr_fpr(truth, pred, thresh)
tprs.append(tpr)
fprs.append(fpr)
threshs.append(thresh)
# area
area = 0
for i in range(1, len(tprs)):
area += (tprs[i]+tprs[i-1])*(fprs[i]-fprs[i-1])/2
return area
# we can simply use pred as thresholds!
def get_auc(truth, pred):
truth = np.array(truth)
pred = np.array(pred)
def get_tpr_fpr(truth, pred, thresh):
pred = (pred >= thresh).astype(np.int)
tp = np.logical_and(truth == 1, pred == 1).sum()
tn = np.logical_and(truth == 0, pred == 0).sum()
fn = np.logical_and(truth == 1, pred == 0).sum()
fp = np.logical_and(truth == 0, pred == 1).sum()
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
return tpr, fpr
# get tprs, fprs
idx = np.argsort(pred)[::-1] # descending threshs
pred = pred[idx]
truth = truth[idx]
threshs = np.insert(pred, 0, pred[0]+1) # insert a large thresh at beginning to get [0,0]
tprs = [0]
fprs = [0]
for thresh in pred:
tpr, fpr = get_tpr_fpr(truth, pred, thresh)
tprs.append(tpr)
fprs.append(fpr)
# area
area = 0
for i in range(1, len(tprs)):
area += (tprs[i]+tprs[i-1])*(fprs[i]-fprs[i-1])/2
return area
# amazing cumsum
def get_auc_final(truth, pred):
truth = np.array(truth)
pred = np.array(pred)
# get tprs, fprs
idx = np.argsort(pred)[::-1]
pred = pred[idx]
truth = truth[idx]
tprs = np.cumsum(truth)
fprs = np.cumsum(1 - truth)
tprs = tprs / tprs[-1] # get rate
fprs = fprs / fprs[-1]
tprs = np.insert(tprs, 0, 0)
fprs = np.insert(fprs, 0, 0)
threshs = np.insert(pred, 0, pred[0]+1)
# area
area = 0
for i in range(1, len(tprs)):
area += (tprs[i]+tprs[i-1])*(fprs[i]-fprs[i-1])/2
return area