Initial push from Colab
- README.md +14 -0
- config.json +8 -0
- inference.py +39 -0
- lstm_model.py +28 -0
- pytorch_model.bin +3 -0
- vocab.pkl +3 -0
README.md
ADDED
@@ -0,0 +1,14 @@
+# IMDB Bi-LSTM Sentiment Classifier
+
+Two-layer bidirectional LSTM trained from scratch on the 25,000-review IMDB movie-review dataset.
+
+* **Sequence cap:** 500 tokens
+* **Hidden size:** 256 · 2 directions · 2 layers
+* **Validation accuracy:** ~0.88
+* **AUC:** 0.94
+
+```python
+from inference import predict
+print(predict("Terrific cast and a heart-warming story!"))
+# ➜ 0.96 (positive)
+```
config.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "vocab_size": 5000,
+  "pad_len": 500,
+  "embed_dim": 256,
+  "hidden_dim": 256,
+  "n_layers": 2,
+  "bidirectional": true
+}
inference.py
ADDED
@@ -0,0 +1,39 @@
+import torch, pickle, json, string, nltk
+from pathlib import Path
+from lstm_model import LSTMClassifier
+
+PAD = 0
+UNK = 1
+ROOT = Path(__file__).resolve().parent
+
+cfg = json.load(open(ROOT / 'config.json'))
+vocab = pickle.load(open(ROOT / 'vocab.pkl', 'rb'))
+
+# 'pad_len' is a preprocessing setting, not an LSTMClassifier argument.
+model = LSTMClassifier(**{k: v for k, v in cfg.items() if k != 'pad_len'})
+model.load_state_dict(torch.load(ROOT / 'pytorch_model.bin', map_location='cpu'))
+model.eval()
+
+nltk.download('stopwords', quiet=True)
+STOP = set(nltk.corpus.stopwords.words('english'))
+PUNC = str.maketrans('', '', string.punctuation)
+
+def preprocess(text):
+    # Lower-case, strip punctuation, drop stopwords, truncate to pad_len.
+    text = text.lower().translate(PUNC)
+    toks = [w for w in text.split() if w not in STOP]
+    return toks[:cfg['pad_len']]
+
+def encode(tokens):
+    # Map tokens to ids and right-pad the sequence to pad_len.
+    ids = [vocab.get(w, UNK) for w in tokens]
+    ids += [PAD] * (cfg['pad_len'] - len(ids))
+    # pack_padded_sequence rejects zero-length sequences, so clamp to 1.
+    return torch.tensor(ids).unsqueeze(0), torch.tensor([max(len(tokens), 1)])
+
+@torch.no_grad()
+def predict(text):
+    x, length = encode(preprocess(text))
+    logit = model(x, length)
+    prob = torch.sigmoid(logit).item()
+    return prob  # 0-1, > 0.5 → positive
lstm_model.py
ADDED
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+
+class LSTMClassifier(nn.Module):
+    def __init__(self, vocab_size, embed_dim=256, hidden_dim=256,
+                 n_layers=2, dropout=0.3, bidirectional=True):
+        super().__init__()
+        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
+        self.lstm = nn.LSTM(embed_dim,
+                            hidden_dim,
+                            n_layers,
+                            batch_first=True,
+                            dropout=dropout,
+                            bidirectional=bidirectional)
+        self.bi = 2 if bidirectional else 1
+        self.fc = nn.Linear(hidden_dim * self.bi, 1)
+
+    def forward(self, x, lengths):
+        x = self.embedding(x)
+        # Pack so the LSTM skips the PAD positions.
+        packed = nn.utils.rnn.pack_padded_sequence(
+            x, lengths.cpu(), batch_first=True, enforce_sorted=False)
+        _, (h, _) = self.lstm(packed)
+        if self.bi == 2:
+            h = torch.cat((h[-2], h[-1]), dim=1)  # concat fwd + rev final states
+        else:
+            h = h[-1]
+        return self.fc(h).squeeze(1)
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f09b68978d9462372521b7cf88a15b8c6a969b6d4a963b2f727c6c5c49ea0931
+size 15644160
vocab.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6019ffd7b964344956bb170b8d89c7e75d8b10b310a518afbebb1a0755ebfb12
+size 57712
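Both binaries are Git LFS pointer files, so a plain `git clone` leaves stubs behind; run `git lfs pull` after cloning, or fetch the files directly. A sketch using `huggingface_hub` (the repo id `<user>/imdb-bilstm` is a placeholder for wherever this commit lands):

```python
# Sketch: fetching the LFS-tracked files without a git-lfs clone.
# "<user>/imdb-bilstm" is a hypothetical repo id — substitute the real one.
from huggingface_hub import hf_hub_download

weights = hf_hub_download(repo_id="<user>/imdb-bilstm",
                          filename="pytorch_model.bin")
vocab_path = hf_hub_download(repo_id="<user>/imdb-bilstm",
                             filename="vocab.pkl")
print(weights, vocab_path)
```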