Hindi to Urdu Transliteration Model (Character-Level)

This is a lightweight Transformer-based model trained for character-level transliteration of Hindi poetry into Urdu script. It is specifically tuned for literary and poetic text, making it well suited to applications involving shayari, nazms, and ghazals.

Live Inference

https://rekhtalabs.org/demo/transliterate

Model Overview

| Feature | Value |
| --- | --- |
| Architecture | Transformer (BART-style) |
| Tokenizer | Character-level |
| Embedding Size | 256 |
| Hidden Size | 256 (d_model) |
| Feedforward Size | 512 (dim_feedforward) |
| Encoder Layers | 3 (num_layers) |
| Decoder Layers | 3 (num_layers) |
| Attention Heads | 4 (nhead) |
| Max Sequence Length | 128 (max_len) |
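
For a rough sense of model size, the sketch below passes these hyperparameters to PyTorch's nn.Transformer (the same call made by the Transformer class in the Usage section) and counts the parameters of the encoder/decoder core. The embedding tables and output projection are excluded, since their sizes depend on the tokenizer vocabularies, which are not listed here.

import torch.nn as nn

# Core encoder/decoder only; vocabulary-dependent layers are omitted.
core = nn.Transformer(
    d_model=256,
    nhead=4,
    num_encoder_layers=3,
    num_decoder_layers=3,
    dim_feedforward=512,
    batch_first=True,
)
print(f"core parameters: {sum(p.numel() for p in core.parameters()):,}")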

Usage

from huggingface_hub import snapshot_download

path = snapshot_download(
    repo_id="rekhtalabs/hi-2-ur-translit",
    local_dir="./hi-2-ur-translit",
    local_dir_use_symlinks=False
)

Then install the dependencies from inside the downloaded directory:

cd hi-2-ur-translit
pip install -r requirements.txt

Load the model and run inference:

import torch
import sentencepiece as spm
from torch import nn
from collections import OrderedDict


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        # Precompute the fixed sinusoidal table of shape (max_len, d_model)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        self.pe = pe.unsqueeze(0)  # (1, max_len, d_model), broadcastable over the batch

    def forward(self, x):
        # x: (batch, seq_len, d_model); add the encodings for the first seq_len positions
        return x + self.pe[:, :x.size(1)].to(x.device)
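
# Optional shape check (illustrative addition, not part of the original card): the encoding
# broadcasts over the batch dimension, so the input shape is preserved.
_pe_check = PositionalEncoding(d_model=256)
print(_pe_check(torch.zeros(2, 10, 256)).shape)  # torch.Size([2, 10, 256])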


class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, nhead=4, num_layers=3, dim_feedforward=512, max_len=128):
        super().__init__()
        self.src_tok_emb = nn.Embedding(src_vocab_size, d_model)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward,
            batch_first=True
        )
        self.out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt):
        # Embed tokens and add positional information (both tensors are batch_first)
        src = self.pos_encoder(self.src_tok_emb(src))
        tgt = self.pos_encoder(self.tgt_tok_emb(tgt))
        # Causal mask so each target position attends only to earlier target positions
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(src.device)
        out = self.transformer(src, tgt, tgt_mask=tgt_mask)
        return self.out(out)  # logits over the target vocabulary
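
# Optional shape check (illustrative addition): a forward pass with dummy vocabulary sizes
# returns logits of shape (batch, tgt_len, tgt_vocab_size).
_demo = Transformer(src_vocab_size=100, tgt_vocab_size=120)
_src = torch.randint(0, 100, (1, 16))
_tgt = torch.randint(0, 120, (1, 8))
print(_demo(_src, _tgt).shape)  # torch.Size([1, 8, 120])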


# SentencePiece tokenizers shipped with the repository (Devanagari source, Nastaliq target)
sp_nastaaliq = spm.SentencePieceProcessor(model_file='nastaaliq_bpe.model')
sp_devanagari = spm.SentencePieceProcessor(model_file='devanagari_bpe.model')


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Transformer(
    src_vocab_size=sp_devanagari.get_piece_size(),
    tgt_vocab_size=sp_nastaaliq.get_piece_size()
).to(device)


checkpoint = torch.load("h2u_2.0.pt", map_location=device)
state_dict = checkpoint["model_state_dict"]
# Strip any "module." prefix left over from DataParallel/DistributedDataParallel training
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    new_k = k.replace("module.", "")
    new_state_dict[new_k] = v
model.load_state_dict(new_state_dict)
model.eval()
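
# Optional (illustrative addition): report the size of the loaded model.
print(f"parameters: {sum(p.numel() for p in model.parameters()):,}")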


def transliterate_hindi_to_urdu(text_hindi, max_len=128):
    # Encode the Devanagari input and wrap it with BOS (2) and EOS (3) ids
    src_ids = [2] + sp_devanagari.encode(text_hindi)[:max_len - 2] + [3]
    src_tensor = torch.tensor(src_ids).unsqueeze(0).to(device)

    tgt_ids = [2]  # BOS token
    # Greedy decoding: append the most probable next token one step at a time
    for _ in range(max_len):
        tgt_tensor = torch.tensor(tgt_ids).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(src_tensor, tgt_tensor)
            next_token_logits = output[0, -1, :]
            next_token_id = torch.argmax(next_token_logits).item()

        if next_token_id == 3:  # EOS token
            break
        tgt_ids.append(next_token_id)

    return sp_nastaaliq.decode(tgt_ids[1:])  # drop the leading BOS before decoding
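
# Note (assumption, not stated in the original card): the ids 2 (BOS) and 3 (EOS) are
# hard-coded above. If the shipped SentencePiece models define BOS/EOS pieces, the ids
# can be looked up instead of hard-coded:
#   bos_id = sp_devanagari.bos_id()
#   eos_id = sp_devanagari.eos_id()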


res = transliterate_hindi_to_urdu("थम गए हों बहते बहते चम्पई रुख़्सार पर")
print(res)


Output

تھم گئے ہوں بہتے بہتے چمپئی رخسار پر

Dataset

  • Trained on approximately 1,300,000 Hindi-Urdu ghazal and nazm pairs
  • Sourced and curated specifically for transliteration
  • Character-level alignment ensured for quality
