I am trying to implement the macro F1 score (F-measure) natively in PyTorch instead of using the already-widely-used sklearn.metrics.f1_score in order to calculate the measure directly on the GPU.
From what I understand, in order to compute the macro F1 score, I need to compute the F1 score with the sensitivity and precision for all labels, then take the average of all these.
My current implementation looks like this:
def confusion_matrix(y_pred: torch.Tensor, y_true: torch.Tensor, n_classes: int):
conf_matrix = torch.zeros([n_classes, n_classes], dtype=torch.int)
y_pred = torch.argmax(y_pred, 1)
for t, p in zip(y_true.view(-1), y_pred.view(-1)):
conf_matrix[t.long(), p.long()] += 1
return conf_matrix
def forward(self, y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
conf_matrix = confusion_matrix(y_pred, y_true, self.classes)
TP = conf_matrix.diag()
f1_scores = torch.zeros(self.classes, dtype=torch.float)
for c in range(self.classes):
idx = torch.ones(self.classes, dtype=torch.long)
idx[c] = 0
FP = conf_matrix[c, idx].sum()
FN = conf_matrix[idx, c].sum()
sensitivity = TP[c] / (TP[c] + FN + self.epsilon)
precision = TP[c] / (TP[c] + FP + self.epsilon)
f1_scores[c] += 2.0 * ((precision * sensitivity) / (precision + sensitivity + self.epsilon))
return f1_scores.mean()
self.classes is the number of labels and self.epsilon is a very small value set to 10-e12 which prevents DivisionByZeroError.
When training, I compute the measure for every batch and take the average of all measures as the final score.
The problem is that when I compare my custom F1 score with sklearn's macro F1 score, they are rarely equal.
# example 1
eval_cce 0.5203, eval_f1 0.8068, eval_acc 81.5455, eval_f1_sci 0.8023,
test_cce 0.4784, test_f1 0.7975, test_acc 82.6732, test_f1_sci 0.8097
# example 2
eval_cce 0.3304, eval_f1 0.8211, eval_acc 87.4955, eval_f1_sci 0.8626,
test_cce 0.3734, test_f1 0.8183, test_acc 85.4996, test_f1_sci 0.8424
# example 3
eval_cce 0.4792, eval_f1 0.7982, eval_acc 81.8482, eval_f1_sci 0.8001,
test_cce 0.4722, test_f1 0.7905, test_acc 82.6533, test_f1_sci 0.8139
While I have tried to scan the internet, most cases cover binary classification. I have yet been able to discover an example to attempts to do what I am trying to.
Is there any obvious issue with my attempt?
I have yet to figure out my mistake. Due to time constraint, I decided to just use the F1 macro score provided by sklearn. While it cannot work directly with GPU tensors, it is fast enough for my case anyway.
However, it would be awesome if anybody can figure this out, so that anybody else that might stumble upon this issue can get their problem resolved.
I have written my own implementation in Pytorch some time ago:
from typing import Tuple
import torch
class F1Score:
"""
Class for f1 calculation in Pytorch.
"""
def __init__(self, average: str = 'weighted'):
"""
Init.
Args:
average: averaging method
"""
self.average = average
if average not in [None, 'micro', 'macro', 'weighted']:
raise ValueError('Wrong value of average parameter')
@staticmethod
def calc_f1_micro(predictions: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
"""
Calculate f1 micro.
Args:
predictions: tensor with predictions
labels: tensor with original labels
Returns:
f1 score
"""
true_positive = torch.eq(labels, predictions).sum().float()
f1_score = torch.div(true_positive, len(labels))
return f1_score
@staticmethod
def calc_f1_count_for_label(predictions: torch.Tensor,
labels: torch.Tensor, label_id: int) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Calculate f1 and true count for the label
Args:
predictions: tensor with predictions
labels: tensor with original labels
label_id: id of current label
Returns:
f1 score and true count for label
"""
# label count
true_count = torch.eq(labels, label_id).sum()
# true positives: labels equal to prediction and to label_id
true_positive = torch.logical_and(torch.eq(labels, predictions),
torch.eq(labels, label_id)).sum().float()
# precision for label
precision = torch.div(true_positive, torch.eq(predictions, label_id).sum().float())
# replace nan values with 0
precision = torch.where(torch.isnan(precision),
torch.zeros_like(precision).type_as(true_positive),
precision)
# recall for label
recall = torch.div(true_positive, true_count)
# f1
f1 = 2 * precision * recall / (precision + recall)
# replace nan values with 0
f1 = torch.where(torch.isnan(f1), torch.zeros_like(f1).type_as(true_positive), f1)
return f1, true_count
def __call__(self, predictions: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
"""
Calculate f1 score based on averaging method defined in init.
Args:
predictions: tensor with predictions
labels: tensor with original labels
Returns:
f1 score
"""
# simpler calculation for micro
if self.average == 'micro':
return self.calc_f1_micro(predictions, labels)
f1_score = 0
for label_id in range(1, len(labels.unique()) + 1):
f1, true_count = self.calc_f1_count_for_label(predictions, labels, label_id)
if self.average == 'weighted':
f1_score += f1 * true_count
elif self.average == 'macro':
f1_score += f1
if self.average == 'weighted':
f1_score = torch.div(f1_score, len(labels))
elif self.average == 'macro':
f1_score = torch.div(f1_score, len(labels.unique()))
return f1_score
You can test it in the following way:
from sklearn.metrics import f1_score
import numpy as np
errors = 0
for _ in range(10):
labels = torch.randint(1, 10, (4096, 100)).flatten()
predictions = torch.randint(1, 10, (4096, 100)).flatten()
labels1 = labels.numpy()
predictions1 = predictions.numpy()
for av in ['micro', 'macro', 'weighted']:
f1_metric = F1Score(av)
my_pred = f1_metric(predictions, labels)
f1_pred = f1_score(labels1, predictions1, average=av)
if not np.isclose(my_pred.item(), f1_pred.item()):
print('!' * 50)
print(f1_pred, my_pred, av)
errors += 1
if errors == 0:
print('No errors!')
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With