How to use


1. Load your trained FastText LID model

import fasttext
import string

model = fasttext.load_model("lid_ms_en.bin")  # your trained English↔Malay detector
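
Optionally, sanity-check the label set the loaded model exposes (the example output below just assumes a two-way MS/EN model):

print(model.labels)  # e.g. ['__label__ms', '__label__en']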

2. Tokenize the sentence and predict a language per token

def tokenize(text):
    """
    Simple tokenizer:
    - lowercases the text
    - splits on whitespace
    - strips leading/trailing punctuation from each token
    """
    tokens = text.lower().split()  # split on any whitespace
    # strip punctuation from the ends of each token, dropping tokens that become empty
    return [t.strip(string.punctuation) for t in tokens if t.strip(string.punctuation)]
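
For example, the tokenizer lowercases everything and drops surrounding punctuation:

print(tokenize("Saya suka chicken!"))  # ['saya', 'suka', 'chicken']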

def predict_per_token(sentence):
    """
    Given a full sentence, return a list of (token, LANG) tuples.
    """
    preds = []
    for token in tokenize(sentence):
        labels, _ = model.predict(token)  # returns e.g. (('__label__ms',), array([0.98]))
        lang = labels[0].replace("__label__", "").upper()  # '__label__ms' -> 'MS'
        preds.append((token, lang))
    return preds
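
model.predict also returns a probability for the top label, so you can extend the loop above to flag tokens the model is unsure about instead of forcing a language. A minimal sketch, where the 0.7 threshold and the "UND" tag are arbitrary illustrative choices and model/tokenize are the objects defined above:

def predict_per_token_with_threshold(sentence, min_conf=0.7):
    preds = []
    for token in tokenize(sentence):
        labels, probs = model.predict(token)
        lang = labels[0].replace("__label__", "").upper()
        # fall back to "UND" (undetermined) when the top probability is low
        preds.append((token, lang if probs[0] >= min_conf else "UND"))
    return preds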

3. Example usage

input_sentence = "Saya suka chicken and fish pda hari Isnin!"
print(predict_per_token(input_sentence))
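
Depending on how your detector was trained, this should print something along the lines of:

# [('saya', 'MS'), ('suka', 'MS'), ('chicken', 'EN'), ('and', 'EN'), ('fish', 'EN'), ('pda', 'MS'), ('hari', 'MS'), ('isnin', 'MS')]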
