Sergiu2404 committed on
Commit
d694943
·
1 Parent(s): 92b3bd3

refactoring

Browse files
Files changed (4) hide show
  1. README.MD +26 -0
  2. api_inference.py +37 -0
  3. config.json +1 -0
  4. fin_tinybert_pytorch.py +301 -0
README.MD ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: mit
4
+ tags:
5
+ - text-classification
6
+ - finance
7
+ - sentiment-analysis
8
+ datasets:
9
+ - financial_sentiment
10
+ metrics:
11
+ - accuracy
12
+ pipeline_tag: text-classification
13
+ ---
14
+
15
+ # FinTinyBERT - Financial Sentiment Analysis Model
16
+
17
+ This model performs sentiment analysis on financial texts, classifying them as positive, negative, or neutral.
18
+
19
+ ## Usage
20
+
21
+ ```python
22
+ from transformers import pipeline
23
+
24
+ classifier = pipeline("text-classification", model="Sergiu2404/fin_tinybert")
25
+ result = classifier("Company profits are rising.")
26
+ print(result)
+ ```
api_inference.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer
3
+ from fin_tinybert_pytorch import TinyFinBERTRegressor
4
+
5
+
6
class InferenceAPI:
    """Callable wrapper that scores text with a saved TinyFinBERTRegressor.

    The instance loads the regressor weights and tokenizer from a directory
    once, and each call returns a ``{"label", "score"}`` dict per input text.

    Args:
        model_dir: Directory holding ``pytorch_model.bin`` and the tokenizer
            files. Defaults to the previously hard-coded ``./saved_model``.
        threshold: Scores above ``threshold`` are labelled "positive", scores
            below ``-threshold`` "negative", everything else "neutral".
    """

    def __init__(self, model_dir="./saved_model", threshold=0.3):
        # Imported here so only this class depends on the helper; the module
        # it lives in is already imported by this file for the model class.
        from fin_tinybert_pytorch import preprocess_texts

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.threshold = threshold
        # evaluate_model() and test() in fin_tinybert_pytorch both run
        # preprocess_texts before tokenizing, so inference must do the same.
        self._preprocess = preprocess_texts

        self.model = TinyFinBERTRegressor()
        # NOTE(review): torch.load unpickles the file, so model_dir must only
        # ever point at a trusted checkpoint.
        state_dict = torch.load(f"{model_dir}/pytorch_model.bin", map_location=self.device)
        self.model.load_state_dict(state_dict)
        self.model.to(self.device)
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)

    def _label(self, score):
        """Map a regression score onto positive / negative / neutral."""
        if score > self.threshold:
            return "positive"
        if score < -self.threshold:
            return "negative"
        return "neutral"

    def __call__(self, inputs):
        """Score one text or a list of texts.

        Args:
            inputs: A single string, or a list of strings.

        Returns:
            A dict ``{"label": str, "score": float}`` when exactly one text was
            scored (including a one-element list), otherwise a list of such
            dicts in input order. An empty list yields an empty list.
        """
        texts = inputs if isinstance(inputs, list) else [inputs]
        if not texts:
            return []

        results = []
        for text in self._preprocess(texts):
            encoded = self.tokenizer(text, return_tensors="pt", truncation=True,
                                     padding='max_length', max_length=128)
            # The model's forward() takes no token_type_ids argument.
            encoded = {k: v.to(self.device) for k, v in encoded.items() if k != "token_type_ids"}

            with torch.no_grad():
                score = self.model(**encoded)["score"].item()

            results.append({
                "label": self._label(score),
                "score": round(score, 4)
            })

        # Kept from the original: a lone result is unwrapped from its list.
        if len(results) == 1:
            return results[0]
        return results
config.json CHANGED
@@ -2,6 +2,7 @@
2
  "architectures": [
3
  "BertModel"
4
  ],
 
5
  "attention_probs_dropout_prob": 0.1,
6
  "hidden_act": "gelu",
7
  "hidden_size": 312,
 
2
  "architectures": [
3
  "BertModel"
4
  ],
5
+ "pipeline_tag": "text-classification",
6
  "attention_probs_dropout_prob": 0.1,
7
  "hidden_act": "gelu",
8
  "hidden_size": 312,
fin_tinybert_pytorch.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import pandas as pd
4
+ from datasets import Dataset
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.metrics import mean_squared_error, r2_score
7
+ from transformers import AutoTokenizer, Trainer, TrainingArguments, IntervalStrategy
8
+ import re
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from transformers import AutoModel, AutoConfig, AutoTokenizer, Trainer, TrainingArguments, IntervalStrategy
12
+
13
+ from nltk.corpus import stopwords
14
+ import spacy
15
+
16
+
17
class TinyFinBERTRegressor(nn.Module):
    """Transformer encoder with one linear head that regresses a single score.

    Args:
        pretrained_model: Name or path passed to ``from_pretrained`` for both
            the config and the encoder weights. When falsy, ``config`` is used
            instead and the encoder is built without pretrained weights.
        config: Encoder configuration used only when ``pretrained_model`` is
            falsy.

    Raises:
        ValueError: If neither ``pretrained_model`` nor ``config`` is given.
    """

    def __init__(self, pretrained_model='huawei-noah/TinyBERT_General_4L_312D', config=None):
        super().__init__()
        if pretrained_model:
            self.config = AutoConfig.from_pretrained(pretrained_model)
            self.bert = AutoModel.from_pretrained(pretrained_model, config=self.config)
        elif config is not None:
            # The Auto* classes cannot be instantiated directly; an
            # uninitialised encoder has to come from from_config().
            self.config = config
            self.bert = AutoModel.from_config(config)
        else:
            raise ValueError(
                "Provide either a pretrained_model name/path or a config object."
            )
        self.regressor = nn.Linear(self.config.hidden_size, 1)

        # Manually register the position_ids buffer to avoid a missing-key
        # error when loading a saved state dict. The length follows the config
        # and falls back to the previously hard-coded 512.
        max_positions = getattr(self.config, "max_position_embeddings", 512)
        self.bert.embeddings.register_buffer(
            "position_ids",
            torch.arange(max_positions).expand((1, -1)),
            persistent=False,
        )

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and regress one score per sequence.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            labels: Optional target scores; when given, an MSE loss is computed.

        Returns:
            Dict with ``'loss'`` (MSE tensor, or None without labels) and
            ``'score'`` (one value per input sequence).
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0]
        # Squeeze only the head's output dimension. A bare squeeze() would also
        # drop the batch dimension for a batch of one, yielding a 0-dim tensor
        # whose shape no longer matches the labels.
        score = self.regressor(cls_output).squeeze(-1)
        loss = None
        if labels is not None:
            targets = labels.to(score.dtype).reshape(score.shape)
            loss = F.mse_loss(score, targets)
        return {'loss': loss, 'score': score}
41
+
42
+
43
# Cached spaCy pipeline; loading it is expensive and callers invoke
# preprocess_texts repeatedly.
_SPACY_PIPELINE = None


def _get_spacy_pipeline():
    """Load the spaCy pipeline on first use and reuse it afterwards."""
    global _SPACY_PIPELINE
    if _SPACY_PIPELINE is None:
        # ner and parser are disabled to speed up processing.
        _SPACY_PIPELINE = spacy.load("en_core_web_sm", disable=["ner", "parser"])
    return _SPACY_PIPELINE


def preprocess_texts(texts):
    """Lower-case, strip non-letters, and lemmatize each text.

    Args:
        texts: Iterable of strings.

    Returns:
        List of strings, one per input, made of the space-joined non-empty
        lemmas of the cleaned text.
    """
    # The unused negation / stop-word sets were removed: the stop-word filter
    # was already commented out, and building the set required the NLTK
    # stopwords corpus to be installed for no effect on the output.
    nlp = _get_spacy_pipeline()

    processed = []
    for text in texts:
        # NOTE(review): this also strips apostrophes and digits, so "didn't"
        # reaches the lemmatizer as "didnt". Kept unchanged because saved
        # models were trained on this exact form; confirm it is intended.
        text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        lemmas = [token.lemma_ for token in nlp(text) if token.lemma_.strip()]
        processed.append(' '.join(lemmas))
    return processed
60
+
61
+
62
def load_phrasebank(path):
    """Read a ``sentence@label`` file into a DataFrame of texts and scores.

    Each line is split on its LAST ``@`` so sentences that themselves contain
    an ``@`` are still parsed. Labels are matched case-insensitively:
    neutral -> 0.0, negative -> -1.0, positive -> 1.0. Lines without an ``@``
    or with any other label are skipped rather than being scored as positive.

    Args:
        path: Path to the latin-1 encoded text file.

    Returns:
        DataFrame with columns ``text`` (str) and ``score`` (float).
    """
    label_to_score = {'neutral': 0.0, 'negative': -1.0, 'positive': 1.0}
    sents, scores = [], []
    with open(path, 'r', encoding='latin1') as f:
        for line in f:
            sentence, sep, label = line.strip().rpartition('@')
            if not sep:
                continue
            score = label_to_score.get(label.strip().lower())
            if score is None:
                continue
            sents.append(sentence)
            scores.append(score)
    return pd.DataFrame({'text': sents, 'score': scores})
73
+
74
+
75
def load_words_phrases(path):
    """Parse ``text,score`` lines into a DataFrame.

    A line is kept only when it ends with a comma followed by a number; the
    number becomes the score and everything before that comma, stripped,
    becomes the text. Lines that do not match are ignored.

    Args:
        path: Path to the latin-1 encoded file.

    Returns:
        DataFrame with columns ``text`` and ``score``.
    """
    trailing_score = re.compile(r',(-?\d+\.?\d*)$')
    with open(path, 'r', encoding='latin1') as handle:
        entries = [raw.strip() for raw in handle.readlines()]

    # Pair every cleaned line with its match object (or None) once.
    candidates = ((entry, trailing_score.search(entry)) for entry in entries)
    rows = [
        (entry[:hit.start()].strip(), float(hit.group(1)))
        for entry, hit in candidates
        if hit
    ]
    return pd.DataFrame(rows, columns=["text", "score"])
87
+
88
+
89
def train_model(phrase_path, words_path, save_path):
    """Fine-tune a TinyFinBERTRegressor and save its weights and tokenizer.

    The phrase file is split 80/20 (random_state=42); the word/phrase lexicon
    is appended to the training part only, and the held-out 20% is used as the
    evaluation set during training.

    Args:
        phrase_path: File read by ``load_phrasebank``.
        words_path: File read by ``load_words_phrases``.
        save_path: Output directory; created if missing. Receives
            ``pytorch_model.bin``, the tokenizer files, and a ``results``
            sub-directory used by the Trainer.
    """
    os.makedirs(save_path, exist_ok=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    phrase_df = load_phrasebank(phrase_path)
    words_df = load_words_phrases(words_path)
    for frame in (phrase_df, words_df):
        frame['text'] = preprocess_texts(frame['text'])

    train_phrase, held_out = train_test_split(phrase_df, test_size=0.2, random_state=42)
    train_df = pd.concat([train_phrase, words_df])
    test_df = held_out.reset_index(drop=True)

    tokenizer = AutoTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')

    def tokenize(batch):
        # Fixed-length encoding; the regression target is exposed as "labels".
        encoded = tokenizer(batch["text"], padding='max_length', truncation=True, max_length=128)
        encoded["labels"] = batch["score"]
        return encoded

    def regression_metrics(pred):
        # Extra metrics reported next to the evaluation loss.
        return {
            "mse": mean_squared_error(pred.label_ids, pred.predictions),
            "r2": r2_score(pred.label_ids, pred.predictions)
        }

    train_dataset = Dataset.from_pandas(train_df).map(tokenize, batched=True)
    test_dataset = Dataset.from_pandas(test_df).map(tokenize, batched=True)

    args = TrainingArguments(
        output_dir=os.path.join(save_path, "results"),
        eval_strategy=IntervalStrategy.EPOCH,
        save_strategy=IntervalStrategy.EPOCH,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        num_train_epochs=5,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss"
    )

    model = TinyFinBERTRegressor().to(device)

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=regression_metrics
    )
    trainer.train()

    # Unwrap a parallel/distributed wrapper, if any, before saving the weights.
    model_to_save = getattr(model, 'module', model)
    weights_file = os.path.join(save_path, "pytorch_model.bin")
    torch.save(model_to_save.state_dict(), weights_file)
    tokenizer.save_pretrained(save_path)
    print(f"Model saved to {save_path}")
150
+
151
+
152
+ from sklearn.metrics import (
153
+ mean_squared_error, r2_score,
154
+ accuracy_score, precision_score, recall_score, f1_score,
155
+ roc_auc_score, confusion_matrix, cohen_kappa_score
156
+ )
157
+ from sklearn.preprocessing import label_binarize
158
+
159
+
160
def evaluate_model(phrase_path, model_path, threshold=0.3):
    """Score the held-out phrase split with a saved model and print metrics.

    The phrase file is split with the same ``test_size=0.2, random_state=42``
    used by ``train_model``. Regression metrics use the raw scores;
    classification metrics use the scores bucketed into -1 / 0 / 1.

    Args:
        phrase_path: File read by ``load_phrasebank``.
        model_path: Directory holding ``pytorch_model.bin`` and the tokenizer.
        threshold: Scores above ``threshold`` count as class 1, scores below
            ``-threshold`` as class -1, everything else as class 0.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    phrase_df = load_phrasebank(phrase_path)
    _, test_df = train_test_split(phrase_df, test_size=0.2, random_state=42)
    # Take an explicit copy so the column assignment below cannot write into
    # (or warn about writing into) a slice of phrase_df.
    test_df = test_df.copy()
    test_df['text'] = preprocess_texts(test_df['text'])

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = TinyFinBERTRegressor()
    model.load_state_dict(torch.load(os.path.join(model_path, "pytorch_model.bin"), map_location=device))
    model.to(device)
    model.eval()

    y_true, y_scores = [], []
    for text, target in zip(test_df["text"], test_df["score"]):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=128)
        # The model's forward() takes no token_type_ids argument.
        inputs = {k: v.to(device) for k, v in inputs.items() if k != "token_type_ids"}
        with torch.no_grad():
            score = model(**inputs)["score"].item()
        y_scores.append(score)
        y_true.append(target)

    # Regression metrics on the raw scores.
    mse = mean_squared_error(y_true, y_scores)
    r2 = r2_score(y_true, y_scores)

    # Classification metrics on the bucketed scores.
    y_pred = [1 if s > threshold else -1 if s < -threshold else 0 for s in y_scores]
    y_true_classes = [int(round(s)) for s in y_true]

    acc = accuracy_score(y_true_classes, y_pred)
    # zero_division=0 is applied to all three so a never-predicted class is
    # handled the same way everywhere.
    prec = precision_score(y_true_classes, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_true_classes, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true_classes, y_pred, average='weighted', zero_division=0)
    kappa = cohen_kappa_score(y_true_classes, y_pred)
    cm = confusion_matrix(y_true_classes, y_pred)

    # NOTE(review): this AUC is computed from the hard class predictions, not
    # from the continuous scores -- confirm that is the intended definition.
    y_true_bin = label_binarize(y_true_classes, classes=[-1, 0, 1])
    y_score_bin = label_binarize(y_pred, classes=[-1, 0, 1])
    try:
        roc_auc = roc_auc_score(y_true_bin, y_score_bin, average='macro', multi_class='ovo')
    except ValueError:
        # Undefined when a class is missing from the true labels; report NaN
        # instead of losing every other metric.
        roc_auc = float('nan')

    print("Sentiment Regression Metrics:")
    print(f"- MSE: {mse:.4f}")
    print(f"- R²: {r2:.4f}")
    print(f"- Accuracy: {acc:.4f}")
    print(f"- Precision: {prec:.4f}")
    print(f"- Recall: {rec:.4f}")
    print(f"- F1 Score: {f1:.4f}")
    print(f"- ROC-AUC: {roc_auc:.4f}")
    print(f"- Cohen's Kappa: {kappa:.4f}")
    print(f"- Confusion Matrix:\n{cm}")
213
+
214
+
215
def test(model_path):
    """Print predicted scores and labels for a fixed set of sample sentences.

    Args:
        model_path: Directory holding ``pytorch_model.bin`` and the tokenizer.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = TinyFinBERTRegressor()
    model.load_state_dict(torch.load(os.path.join(model_path, "pytorch_model.bin"), map_location=device))
    model.to(device)
    model.eval()

    texts = [
        "The company's earnings exceeded expectations.",
        "They faced major losses this quarter.",
        "They didn't face major losses this quarter.",
        "Stock prices remained the same.",
        "boost",
        "strong boost",
        "AMD was not able to reduce losses.",
        "AMD reduced debt significantly, improves balance sheet",
        "Economic indicators point to contraction in telecom sector",
        "Company didn't have increased losses over last years."
    ]

    # Preprocess the whole list once; calling preprocess_texts per sentence
    # reloaded the spaCy pipeline on every loop iteration.
    clean_texts = preprocess_texts(texts)

    for text, clean_text in zip(texts, clean_texts):
        print(f"Original Text: {text}")
        print(f"Processed Text: {clean_text}")

        tokens = tokenizer.tokenize(clean_text)
        print(f"Tokens: {tokens}")

        inputs = tokenizer(clean_text, return_tensors="pt", truncation=True, padding='max_length', max_length=128)
        # The model's forward() takes no token_type_ids argument.
        inputs = {k: v.to(device) for k, v in inputs.items() if k != "token_type_ids"}

        with torch.no_grad():
            score = model(**inputs)["score"].item()

        print(f"Predicted Sentiment Score: {score:.3f}")
        sentiment = "positive" if score > 0.3 else "negative" if score < -0.3 else "neutral"
        print(f"Sentiment: {sentiment}\n")
256
+
257
+
258
def init_model():
    """Build a TinyFinBERTRegressor whose embeddings carry a position_ids buffer.

    Returns:
        The freshly constructed model. If its embeddings have no
        ``position_ids`` attribute, a non-persistent buffer of 512 positions
        is registered first.
    """
    model = TinyFinBERTRegressor()
    embeddings = model.bert.embeddings

    # Nothing to do when the buffer is already present.
    if hasattr(embeddings, 'position_ids'):
        return model

    position_ids = torch.arange(512).expand((1, -1))
    embeddings.register_buffer("position_ids", position_ids, persistent=False)
    return model
270
+
271
+
272
def create_api_model(model_path):
    """Load a saved model and tokenizer ready for inference.

    Args:
        model_path: Directory holding ``pytorch_model.bin`` and the tokenizer.

    Returns:
        Tuple ``(model, tokenizer, device)`` with the model moved to ``device``
        and switched to eval mode.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # init_model() makes sure position_ids is registered before loading weights.
    model = init_model()
    weights_file = os.path.join(model_path, "pytorch_model.bin")
    state_dict = torch.load(weights_file, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    return model, tokenizer, device
284
+
285
+
286
if __name__ == "__main__":
    # Script entry point: train only when no saved weights exist, then
    # evaluate and run the sample predictions.
    model_dir = "./saved_model"
    phrase_path = "./Sentences_50Agree.txt"
    words_path = "./financial_sentiment_words_phrases_negations.csv"
    weights_file = os.path.join(model_dir, "pytorch_model.bin")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    if os.path.isfile(weights_file):
        print(f"Model found at {weights_file}")
    else:
        print("Training new model...")
        train_model(phrase_path, words_path, model_dir)

    evaluate_model(phrase_path, model_dir)
    test(model_dir)