Initial push from Colab
- README.md +14 -0
- config.json +8 -0
- inference.py +39 -0
- lstm_model.py +28 -0
- pytorch_model.bin +3 -0
- vocab.pkl +3 -0
README.md
ADDED
@@ -0,0 +1,14 @@
+# IMDB Bi-LSTM Sentiment Classifier
+
+Two-layer bidirectional LSTM trained from scratch on the 25,000-review IMDB movie-review dataset.
+
+* **Sequence cap:** 500 tokens
+* **Hidden size:** 256 · 2 directions · 2 layers
+* **Validation accuracy:** ~0.88
+* **AUC:** 0.94
+
+```python
+from inference import predict
+print(predict("Terrific cast and a heart-warming story!"))
+# ➜ 0.96 (positive)
+```
config.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "vocab_size": 5000,
+  "pad_len": 500,
+  "embed_dim": 256,
+  "hidden_dim": 256,
+  "n_layers": 2,
+  "bidirectional": true
+}
inference.py
ADDED
@@ -0,0 +1,39 @@
+import torch, pickle, json, string, nltk
+from pathlib import Path
+from lstm_model import LSTMClassifier
+
+PAD = 0
+UNK = 1
+ROOT = Path(__file__).resolve().parent
+
+cfg = json.load(open(ROOT / 'config.json'))
+vocab = pickle.load(open(ROOT / 'vocab.pkl', 'rb'))
+
+# 'pad_len' is a preprocessing setting, not an LSTMClassifier argument.
+model = LSTMClassifier(**{k: v for k, v in cfg.items() if k != 'pad_len'})
+model.load_state_dict(torch.load(ROOT / 'pytorch_model.bin', map_location='cpu'))
+model.eval()
+
+nltk.download('stopwords', quiet=True)
+STOP = set(nltk.corpus.stopwords.words('english'))
+PUNC = str.maketrans('', '', string.punctuation)
+
+def preprocess(text):
+    # Lower-case, strip punctuation, drop stopwords, truncate to pad_len.
+    text = text.lower().translate(PUNC)
+    toks = [w for w in text.split() if w not in STOP]
+    return toks[:cfg['pad_len']]
+
+def encode(tokens):
+    # Map tokens to ids and right-pad the sequence to pad_len.
+    ids = [vocab.get(w, UNK) for w in tokens]
+    ids += [PAD] * (cfg['pad_len'] - len(ids))
+    # pack_padded_sequence rejects zero-length sequences, so clamp to 1.
+    return torch.tensor(ids).unsqueeze(0), torch.tensor([max(len(tokens), 1)])
+
+@torch.no_grad()
+def predict(text):
+    x, length = encode(preprocess(text))
+    logit = model(x, length)
+    prob = torch.sigmoid(logit).item()
+    return prob  # 0-1, > 0.5 → positive
lstm_model.py
ADDED
@@ -0,0 +1,28 @@
+import torch
+import torch.nn as nn
+
+class LSTMClassifier(nn.Module):
+    def __init__(self, vocab_size, embed_dim=256, hidden_dim=256,
+                 n_layers=2, dropout=0.3, bidirectional=True):
+        super().__init__()
+        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
+        self.lstm = nn.LSTM(embed_dim,
+                            hidden_dim,
+                            n_layers,
+                            batch_first=True,
+                            dropout=dropout,
+                            bidirectional=bidirectional)
+        self.bi = 2 if bidirectional else 1
+        self.fc = nn.Linear(hidden_dim * self.bi, 1)
+
+    def forward(self, x, lengths):
+        x = self.embedding(x)
+        # Pack so the LSTM skips the PAD positions.
+        packed = nn.utils.rnn.pack_padded_sequence(
+            x, lengths.cpu(), batch_first=True, enforce_sorted=False)
+        _, (h, _) = self.lstm(packed)
+        if self.bi == 2:
+            h = torch.cat((h[-2], h[-1]), dim=1)  # concat fwd + rev final states
+        else:
+            h = h[-1]
+        return self.fc(h).squeeze(1)
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f09b68978d9462372521b7cf88a15b8c6a969b6d4a963b2f727c6c5c49ea0931
+size 15644160
vocab.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6019ffd7b964344956bb170b8d89c7e75d8b10b310a518afbebb1a0755ebfb12
+size 57712
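Both binaries are Git LFS pointer files, so a plain `git clone` leaves stubs behind; run `git lfs pull` after cloning, or fetch the files directly. A sketch using `huggingface_hub` (the repo id `<user>/imdb-bilstm` is a placeholder for wherever this commit lands):

```python
# Sketch: fetching the LFS-tracked files without a git-lfs clone.
# "<user>/imdb-bilstm" is a hypothetical repo id — substitute the real one.
from huggingface_hub import hf_hub_download

weights = hf_hub_download(repo_id="<user>/imdb-bilstm",
                          filename="pytorch_model.bin")
vocab_path = hf_hub_download(repo_id="<user>/imdb-bilstm",
                             filename="vocab.pkl")
print(weights, vocab_path)
```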