Rekhta Lab Logo

Urdu to Hindi Transliteration Model (Character-Level)

This is a lightweight Transformer-based model trained for character-level transliteration of Urdu poetry into Hindi script. The model is specially tuned for literary and poetic text, making it ideal for applications involving shayari, nazm, or ghazals.

Live Inference

https://rekhtalabs.org/demo/transliterate

Model Overview

| Feature | Value |
|---|---|
| Architecture | Transformer (BART-style) |
| Tokenizer | Character-level |
| Total Parameters | 4M |
| Source Vocab Size | 87 (Urdu characters) |
| Target Vocab Size | 109 (Hindi characters) |
| Embedding Size | 256 |
| Hidden Size | 256 (d_model) |
| Feedforward Size | 512 |
| Encoder Layers | 3 |
| Decoder Layers | 3 |
| Attention Heads | 4 |
| Max Sequence Length | 128 characters |

Usage

# Download the full model repository (weights, SentencePiece models,
# requirements.txt) into a local directory; `path` is the local checkout dir.
from huggingface_hub import snapshot_download

path = snapshot_download(
    repo_id="rekhtalabs/ur-2-hi-translit",
    local_dir="./ur-2-hi-translit",
    local_dir_use_symlinks=False
)

# Enter the downloaded repo and install its Python dependencies
# before running the inference code below.
cd ur-2-hi-translit
pip install -r requirements.txt
import math

import sentencepiece as spm
import torch
from torch import nn


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)

class Transformer(nn.Module):
    """Encoder-decoder transformer for character-level transliteration.

    Thin wrapper around ``torch.nn.Transformer``: token embeddings plus
    sinusoidal positional encoding on both sides, and a final linear
    projection from d_model to the target vocabulary size.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, nhead=4,
                 num_layers=3, dim_feedforward=512, max_len=128):
        super().__init__()
        # Attribute names are checkpoint state_dict keys — do not rename.
        self.src_tok_emb = nn.Embedding(src_vocab_size, d_model)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward,
            batch_first=True
        )
        self.out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt):
        """Map (batch, src_len) and (batch, tgt_len) id tensors to
        (batch, tgt_len, tgt_vocab_size) logits."""
        src_emb = self.pos_encoder(self.src_tok_emb(src))
        tgt_emb = self.pos_encoder(self.tgt_tok_emb(tgt))
        # Causal mask: each target position may attend only to itself and
        # earlier positions during decoding.
        causal_mask = nn.Transformer.generate_square_subsequent_mask(
            tgt_emb.size(1)
        ).to(src_emb.device)
        decoded = self.transformer(src_emb, tgt_emb, tgt_mask=causal_mask)
        return self.out(decoded)


# Inference runs on CPU; the 4M-parameter model is small enough for it.
device = torch.device("cpu")
# Character-level SentencePiece models: Urdu (Nastaliq) source and
# Hindi (Devanagari) target. Files come from the downloaded repo.
sp_nastaaliq = spm.SentencePieceProcessor(model_file='nastaaliq_char.model')
sp_devanagari = spm.SentencePieceProcessor(model_file='devanagari_char.model')

# Vocab sizes are taken from the tokenizers so they always match the
# embedding tables in the checkpoint.
model = Transformer(
    src_vocab_size=sp_nastaaliq.get_piece_size(),
    tgt_vocab_size=sp_devanagari.get_piece_size()
)
checkpoint = torch.load("transformer_transliteration_final.pt", map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()
model.to(device)


def transliterate_urdu_to_hindi(text_urdu, max_len=128):
    """Greedily decode Urdu (Nastaliq) text into Hindi (Devanagari) script.

    Uses the module-level ``model``, ``sp_nastaaliq``, ``sp_devanagari``
    and ``device``. Ids 2 and 3 act as the BOS and EOS markers here (see
    the literals below). Output is capped at ``max_len`` tokens.

    :param text_urdu: Urdu input string.
    :param max_len: maximum sequence length (source is truncated to fit).
    :return: transliterated Hindi string.
    """
    # Encode source with BOS/EOS, truncating so the total fits max_len.
    src_ids = [2] + sp_nastaaliq.encode(text_urdu)[:max_len - 2] + [3]
    src_tensor = torch.tensor(src_ids).unsqueeze(0).to(device)  # shape: (1, seq_len)

    # Target starts with BOS only and grows one token per step.
    tgt_ids = [2]
    tgt_tensor = torch.tensor(tgt_ids).unsqueeze(0).to(device)

    # no_grad: this is pure inference — without it, autograd would record
    # the full forward graph on every decoding step, wasting time and
    # memory that grows with the output length.
    with torch.no_grad():
        for _ in range(max_len):
            output = model(src_tensor, tgt_tensor)
            # Greedy pick from the logits of the last generated position.
            next_token_logits = output[0, -1, :]
            next_token_id = torch.argmax(next_token_logits).item()

            if next_token_id == 3:  # EOS -> stop decoding
                break

            tgt_ids.append(next_token_id)
            tgt_tensor = torch.tensor(tgt_ids).unsqueeze(0).to(device)

    # Drop the leading BOS before decoding ids back to text.
    return sp_devanagari.decode(tgt_ids[1:])

# Example: transliterate one line of Urdu verse and print the result.
res=transliterate_urdu_to_hindi("وسوسے دل میں نہ رکھ خوف رسن لے کے نہ چل")
print(res)


Output

वसवसे दिल में न रख ख़ौफ़-ए-रसन ले के न चल

Dataset

  • Trained on approximately 800,000 Urdu–Hindi sentence pairs.
  • Sourced and curated specifically for the transliteration task.
  • Character-level alignment was verified to ensure quality.

Downloads last month
18
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support