Thomas Boulier committed
Commit · b394f99
Parent(s): 0f78bcc
refactor: move mappings to data_loaders.py
tasks/data/data_loaders.py
CHANGED
@@ -15,7 +15,7 @@ class DataLoader(ABC):
         pass
 
 class TextDataLoader(DataLoader):
-    def __init__(self, request: TextEvaluationRequest, light: bool = False):
+    def __init__(self, request: TextEvaluationRequest = TextEvaluationRequest(), light: bool = False):
         self.label_mapping = {
             "0_not_relevant": 0,
             "1_not_happening": 1,
@@ -43,3 +43,9 @@ class TextDataLoader(DataLoader):
 
     def get_test_dataset(self):
        return self.dataset["test"]
+
+    def get_label_to_id_mapping(self):
+        return self.label_mapping
+
+    def get_id_to_label_mapping(self):
+        return {v: k for k, v in self.label_mapping.items()}
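
The two new accessors make the loader the single source of truth for the label maps. A minimal usage sketch (hypothetical, not part of this commit; it assumes the default TextEvaluationRequest() constructs successfully):

    from tasks.data.data_loaders import TextDataLoader

    # Hypothetical consumer code: fetch both mappings from the loader
    # instead of redefining them locally.
    loader = TextDataLoader(light=True)

    label_to_id = loader.get_label_to_id_mapping()  # {"0_not_relevant": 0, "1_not_happening": 1, ...}
    id_to_label = loader.get_id_to_label_mapping()  # inverse: {0: "0_not_relevant", ...}

    # Round-tripping a label through both maps is the identity.
    assert id_to_label[label_to_id["1_not_happening"]] == "1_not_happening"
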
tasks/models/text_classifiers.py
CHANGED
@@ -8,6 +8,7 @@ from transformers import AutoTokenizer, DataCollatorWithPadding, create_optimizer
 import evaluate
 
 from tasks.data.data_loaders import TextDataLoader
+from tasks.utils.evaluation import TextEvaluationRequest
 
 # Define label mappings
 LABEL_TO_ID_MAPPING = {
@@ -23,7 +24,7 @@ LABEL_TO_ID_MAPPING = {
 ID_TO_LABEL_MAPPING = {v: k for k, v in LABEL_TO_ID_MAPPING.items()}
 
 class PredictionModel(ABC):
-    def __init__(self):
+    def __init__(self, data_loader: TextDataLoader = TextDataLoader()):
         self.description = ""
 
     @abstractmethod
@@ -44,12 +45,24 @@ class PredictionModel(ABC):
         pass
 
     @abstractmethod
-    def train(self, dataset):
+    def train(self, dataset) -> None:
+        """
+        Train the model on a given dataset.
+
+        Parameters:
+        -----------
+        dataset:
+            The dataset to train on.
+
+        Returns:
+        --------
+        None
+        """
         pass
 
 
 class BaselineModel(PredictionModel):
-    def __init__(self):
+    def __init__(self, data_loader: TextDataLoader = TextDataLoader()):
         super().__init__()
         self.description = "Random Baseline (with Strategy Pattern, from another module)"
 
@@ -60,10 +73,12 @@ class BaselineModel(PredictionModel):
         pass
 
 class DistilBERTModel(PredictionModel):
-    def __init__(self):
+    def __init__(self, data_loader: TextDataLoader = TextDataLoader()):
         super().__init__()
         self.description = "DistilBERT Model"
         self.model = None
+        self.label_to_id_mapping = data_loader.get_label_to_id_mapping()
+        self.id_to_label_mapping = data_loader.get_id_to_label_mapping()
 
         # tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
@@ -83,9 +98,10 @@ class DistilBERTModel(PredictionModel):
         logits = outputs.logits
         probabilities = tf.nn.softmax(logits)
         predicted_label = self.model.config.id2label[tf.argmax(probabilities, axis=1).numpy()[0]]
-        return
+        return self.label_to_id_mapping[predicted_label]
 
     def train(self, dataset):
+        # Pre-process data
         tokenized_data = self.pre_process_data(dataset)
 
         # Training setup
@@ -99,8 +115,8 @@ class DistilBERTModel(PredictionModel):
         self.model = TFAutoModelForSequenceClassification.from_pretrained(
             "distilbert-base-uncased",
             num_labels=8,
-            id2label=ID_TO_LABEL_MAPPING,
-            label2id=LABEL_TO_ID_MAPPING
+            id2label=self.id_to_label_mapping,
+            label2id=self.label_to_id_mapping
         )
 
         # Convert datasets to tf.data.Dataset format
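
With the mappings injected through the constructor, the models no longer need the module-level constants. A hypothetical wiring sketch (not part of this commit; get_train_dataset() is an assumed accessor, only get_test_dataset() appears in the diff):

    from tasks.data.data_loaders import TextDataLoader
    from tasks.models.text_classifiers import DistilBERTModel

    # Hypothetical usage: one loader instance supplies both the data and the
    # label mappings, so model and data cannot disagree on label ids.
    loader = TextDataLoader()
    model = DistilBERTModel(data_loader=loader)
    model.train(loader.get_train_dataset())  # assumed accessor, mirroring get_test_dataset()

One design caveat: Python evaluates default arguments once at definition time, so each TextDataLoader() default in these signatures creates a single loader that is instantiated at import and shared across calls; a data_loader=None default resolved inside __init__ would avoid that.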