Thomas Boulier committed on
Commit
b394f99
·
1 Parent(s): 0f78bcc

refactor: move mappings to data_loaders.py

Browse files
tasks/data/data_loaders.py CHANGED
@@ -15,7 +15,7 @@ class DataLoader(ABC):
15
  pass
16
 
17
  class TextDataLoader(DataLoader):
18
- def __init__(self, request: TextEvaluationRequest, light: bool = False):
19
  self.label_mapping = {
20
  "0_not_relevant": 0,
21
  "1_not_happening": 1,
@@ -43,3 +43,9 @@ class TextDataLoader(DataLoader):
43
 
44
  def get_test_dataset(self):
45
  return self.dataset["test"]
 
 
 
 
 
 
 
15
  pass
16
 
17
  class TextDataLoader(DataLoader):
18
+ def __init__(self, request: TextEvaluationRequest = TextEvaluationRequest(), light: bool = False):
19
  self.label_mapping = {
20
  "0_not_relevant": 0,
21
  "1_not_happening": 1,
 
43
 
44
  def get_test_dataset(self):
45
  return self.dataset["test"]
46
+
47
+ def get_label_to_id_mapping(self):
48
+ return self.label_mapping
49
+
50
+ def get_id_to_label_mapping(self):
51
+ return {v: k for k, v in self.label_mapping.items()}
tasks/models/text_classifiers.py CHANGED
@@ -8,6 +8,7 @@ from transformers import AutoTokenizer, DataCollatorWithPadding, create_optimize
8
  import evaluate
9
 
10
  from tasks.data.data_loaders import TextDataLoader
 
11
 
12
  # Define label mappings
13
  LABEL_TO_ID_MAPPING = {
@@ -23,7 +24,7 @@ LABEL_TO_ID_MAPPING = {
23
  ID_TO_LABEL_MAPPING = {v: k for k, v in LABEL_TO_ID_MAPPING.items()}
24
 
25
  class PredictionModel(ABC):
26
- def __init__(self):
27
  self.description = ""
28
 
29
  @abstractmethod
@@ -44,12 +45,24 @@ class PredictionModel(ABC):
44
  pass
45
 
46
  @abstractmethod
47
- def train(self, dataset):
 
 
 
 
 
 
 
 
 
 
 
 
48
  pass
49
 
50
 
51
  class BaselineModel(PredictionModel):
52
- def __init__(self):
53
  super().__init__()
54
  self.description = "Random Baseline (with Strategy Pattern, from another module)"
55
 
@@ -60,10 +73,12 @@ class BaselineModel(PredictionModel):
60
  pass
61
 
62
  class DistilBERTModel(PredictionModel):
63
- def __init__(self):
64
  super().__init__()
65
  self.description = "DistilBERT Model"
66
  self.model = None
 
 
67
 
68
  # tokenizer
69
  self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
@@ -83,9 +98,10 @@ class DistilBERTModel(PredictionModel):
83
  logits = outputs.logits
84
  probabilities = tf.nn.softmax(logits)
85
  predicted_label = self.model.config.id2label[tf.argmax(probabilities, axis=1).numpy()[0]]
86
- return LABEL_TO_ID_MAPPING[predicted_label]
87
 
88
  def train(self, dataset):
 
89
  tokenized_data = self.pre_process_data(dataset)
90
 
91
  # Training setup
@@ -99,8 +115,8 @@ class DistilBERTModel(PredictionModel):
99
  self.model = TFAutoModelForSequenceClassification.from_pretrained(
100
  "distilbert-base-uncased",
101
  num_labels=8,
102
- id2label=ID_TO_LABEL_MAPPING,
103
- label2id=LABEL_TO_ID_MAPPING
104
  )
105
 
106
  # Convert datasets to tf.data.Dataset format
 
8
  import evaluate
9
 
10
  from tasks.data.data_loaders import TextDataLoader
11
+ from tasks.utils.evaluation import TextEvaluationRequest
12
 
13
  # Define label mappings
14
  LABEL_TO_ID_MAPPING = {
 
24
  ID_TO_LABEL_MAPPING = {v: k for k, v in LABEL_TO_ID_MAPPING.items()}
25
 
26
  class PredictionModel(ABC):
27
+ def __init__(self, data_loader: TextDataLoader = TextDataLoader()):
28
  self.description = ""
29
 
30
  @abstractmethod
 
45
  pass
46
 
47
  @abstractmethod
48
+ def train(self, dataset) -> None:
49
+ """
50
+ Train the model on a given dataset.
51
+
52
+ Parameters:
53
+ -----------
54
+ dataset:
55
+ The dataset to train on.
56
+
57
+ Returns:
58
+ --------
59
+ None
60
+ """
61
  pass
62
 
63
 
64
  class BaselineModel(PredictionModel):
65
+ def __init__(self, data_loader: TextDataLoader = TextDataLoader()):
66
  super().__init__()
67
  self.description = "Random Baseline (with Strategy Pattern, from another module)"
68
 
 
73
  pass
74
 
75
  class DistilBERTModel(PredictionModel):
76
+ def __init__(self, data_loader: TextDataLoader = TextDataLoader()):
77
  super().__init__()
78
  self.description = "DistilBERT Model"
79
  self.model = None
80
+ self.label_to_id_mapping = data_loader.get_label_to_id_mapping()
81
+ self.id_to_label_mapping = data_loader.get_id_to_label_mapping()
82
 
83
  # tokenizer
84
  self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
 
98
  logits = outputs.logits
99
  probabilities = tf.nn.softmax(logits)
100
  predicted_label = self.model.config.id2label[tf.argmax(probabilities, axis=1).numpy()[0]]
101
+ return self.label_to_id_mapping[predicted_label]
102
 
103
  def train(self, dataset):
104
+ # Pre-process data
105
  tokenized_data = self.pre_process_data(dataset)
106
 
107
  # Training setup
 
115
  self.model = TFAutoModelForSequenceClassification.from_pretrained(
116
  "distilbert-base-uncased",
117
  num_labels=8,
118
+ id2label=self.id_to_label_mapping,
119
+ label2id=self.label_to_id_mapping
120
  )
121
 
122
  # Convert datasets to tf.data.Dataset format