ecroatt committed on
Commit
9eced06
·
verified ·
1 Parent(s): c26665e

Initial push from Colab

Browse files
Files changed (6) hide show
  1. README.md +13 -0
  2. config.json +8 -0
  3. inference.py +35 -0
  4. lstm_model.py +26 -0
  5. pytorch_model.bin +3 -0
  6. vocab.pkl +3 -0
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # IMDB Bi-LSTM Sentiment Classifier
2
+
3
+ Two-layer bidirectional LSTM trained from scratch on the 25 000-review IMDB movie-review dataset.
4
+
5
+ * **Sequence cap:** 500 tokens
6
+ * **Hidden size:** 256 · 2 directions · 2 layers
7
+ * **Validation accuracy:** ~0.88
8
+ * **AUC:** 0.94
9
+
10
+ ```python
11
+ from inference import predict
12
+ print(predict("Terrific cast and a heart-warming story!"))
13
+ # ➜ 0.96 (positive)
config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 5000,
3
+ "pad_len": 500,
4
+ "embed_dim": 256,
5
+ "hidden_dim": 256,
6
+ "n_layers": 2,
7
+ "bidirectional": true
8
+ }
inference.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch, pickle, json, string, nltk
3
+ from pathlib import Path
4
+ from lstm_model import LSTMClassifier
5
+
6
PAD = 0  # id used to right-pad sequences; the model's embedding uses padding_idx=0
UNK = 1  # id substituted for tokens missing from the vocabulary
ROOT = Path(__file__).resolve().parent

# Hyper-parameters shared by preprocessing and the model constructor.
# The context manager closes the file handle as soon as parsing is done.
with open(ROOT / 'config.json', encoding='utf-8') as _fh:
    cfg = json.load(_fh)

# NOTE(security): unpickling can execute arbitrary code. Only load a
# vocab.pkl that comes from a trusted source.
with open(ROOT / 'vocab.pkl', 'rb') as _fh:
    vocab = pickle.load(_fh)

# 'pad_len' is a preprocessing setting, not a constructor argument of
# LSTMClassifier; forwarding it would raise
# "TypeError: __init__() got an unexpected keyword argument 'pad_len'".
_model_kwargs = {key: value for key, value in cfg.items() if key != 'pad_len'}
model = LSTMClassifier(**_model_kwargs).eval()
model.load_state_dict(torch.load(ROOT / 'pytorch_model.bin', map_location='cpu'))

# Stopword list and punctuation-stripping table used by preprocess().
nltk.download('stopwords', quiet=True)
STOP = set(nltk.corpus.stopwords.words('english'))
PUNC = str.maketrans('', '', string.punctuation)
19
+
20
def preprocess(text):
    """Turn raw review text into a list of model-ready tokens.

    The text is lower-cased, ASCII punctuation is removed, the result is
    split on whitespace, English stopwords are dropped, and the token list
    is cut to at most ``cfg['pad_len']`` entries.
    """
    cleaned = text.lower().translate(PUNC)
    kept = []
    for word in cleaned.split():
        if word in STOP:
            continue
        kept.append(word)
    limit = cfg['pad_len']
    return kept[:limit]
24
+
25
def encode(tokens):
    """Map tokens to vocabulary ids and right-pad to ``cfg['pad_len']``.

    Args:
        tokens: list of string tokens, as produced by ``preprocess``.

    Returns:
        A pair ``(ids, length)``: ``ids`` is a ``(1, N)`` integer tensor of
        token ids padded with ``PAD``, and ``length`` is a 1-element tensor
        holding the number of real tokens (at least 1).
    """
    ids = [vocab.get(w, UNK) for w in tokens]
    ids += [PAD] * (cfg['pad_len'] - len(ids))
    # pack_padded_sequence rejects zero-length sequences, so text that is
    # empty (or consists only of stopwords/punctuation) is treated as a
    # single PAD token instead of crashing inside the model.
    length = max(len(tokens), 1)
    batch = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    return batch, torch.tensor([length])
29
+
30
@torch.no_grad()
def predict(text):
    """Score a piece of text with the loaded classifier.

    Returns the sigmoid of the model's logit as a Python float in [0, 1];
    values above 0.5 indicate positive sentiment.
    """
    token_ids, seq_len = encode(preprocess(text))
    score = model(token_ids, seq_len)
    return torch.sigmoid(score).item()
lstm_model.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import torch.nn as nn
2
+
3
class LSTMClassifier(nn.Module):
    """Stacked, optionally bidirectional LSTM producing one logit per sequence.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability applied between LSTM layers.
        bidirectional: run the LSTM in both directions when True.
    """

    def __init__(self, vocab_size, embed_dim=256, hidden_dim=256,
                 n_layers=2, dropout=0.3, bidirectional=True):
        super().__init__()
        # Index 0 is the padding id; its embedding is not updated by gradients.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            n_layers,
            batch_first=True,
            # nn.LSTM applies dropout only between layers and warns when a
            # non-zero value is requested for a single-layer network.
            dropout=dropout if n_layers > 1 else 0.0,
            bidirectional=bidirectional,
        )
        self.bi = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * self.bi, 1)

    def forward(self, x, lengths):
        """Return one raw logit per sequence in the batch.

        Args:
            x: integer tensor of token ids, batch-first and right-padded.
            lengths: 1-D tensor with the unpadded length of each sequence.

        Returns:
            A 1-D tensor of logits (no sigmoid applied), one per sequence.
        """
        x = self.embedding(x)
        # Packing makes the LSTM stop at each sequence's true length, so the
        # final hidden state is not polluted by padding positions.
        packed = nn.utils.rnn.pack_padded_sequence(
            x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (h, _) = self.lstm(packed)
        if self.bi == 2:
            # The last two entries of h_n are the top layer's forward and
            # backward final states; join them along the feature axis.
            h = torch.cat((h[-2], h[-1]), dim=1)
        else:
            h = h[-1]
        return self.fc(h).squeeze(1)
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f09b68978d9462372521b7cf88a15b8c6a969b6d4a963b2f727c6c5c49ea0931
3
+ size 15644160
vocab.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6019ffd7b964344956bb170b8d89c7e75d8b10b310a518afbebb1a0755ebfb12
3
+ size 57712