prajdabre committed (verified) · commit 118dc6a · 1 parent: 21ea539

Update config file and model.safetensors

README.md CHANGED
@@ -1,17 +1,18 @@
 ---
 license: mit
 ---
-# Models
-These models are created from their respective IndicTrans2 parent versions by simply replacing the Sinusoidal Positional Embedding with Rotary Positional Embedding ([Su _et al._](https://arxiv.org/abs/2104.09864)), and finetuning them for further alignment.
 
+# Models
 
+These models are created from their respective IndicTrans2 parent versions by simply replacing the Sinusoidal Positional Embedding with Rotary Positional Embedding ([Su _et al._](https://arxiv.org/abs/2104.09864)), and finetuning them for further alignment.
 
-*NOTE*:
-These models are my independent reproduction of the paper: [Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models](https://arxiv.org/abs/2408.11382).
+_NOTE_:
+These models are my independent reproduction of the paper: [Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models](https://arxiv.org/abs/2408.11382).
 
-Detailed information on the data mixture, hyperparameters, and training curriculum can be found in the paper.
+Detailed information on the data mixture, hyperparameters, and training curriculum can be found in the paper.
 
 # Usage
+
 The usage instructions are very similar to the [IndicTrans2 HuggingFace models](https://huggingface.co/collections/ai4bharat/indictrans2-664ccb91d23bbae0d681c3ca); you'll need to install the [IndicTransToolkit](https://github.com/VarunGumma/IndicTransToolkit) to use the `IndicProcessor` module to pre-process the source texts before generating translations.
 
 ```python
@@ -64,19 +65,25 @@ print(" | > Translations:", outputs[0])
 ```
 
 # Citation
+
 If you use these models directly or fine-tune them further for additional use cases, please cite the following work:
 
 ```bibtex
 @misc{gumma2025inducinglongcontextabilitiesmultilingual,
-  title={Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models},
+  title={Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models},
   author={Varun Gumma and Pranjal A. Chitale and Kalika Bali},
   year={2025},
   eprint={2408.11382},
   archivePrefix={arXiv},
   primaryClass={cs.CL},
-  url={https://arxiv.org/abs/2408.11382},
+  url={https://arxiv.org/abs/2408.11382},
 }
 ```
 
+# Note
+
+These updated models are primarily built and tested for document-level and long-context translation; performance on shorter, sentence-level inputs may be sub-optimal and may require tuning of the generation parameters. Please thoroughly verify the models on your use case before scaling up generation.
+
 # Warning
+
-Occasionally, you may notice some variation in the output, which may not be optimal. In such cases, you can experiment with adjusting the `num_beams`, `repetition_penalty`, and `length_penalty` parameters in the `generation_config`. Based on standard testing, the example with an input size of 1457 can be run on a single A100 GPU. However, the 1B model might require more compute resources or a lower beam size for generation.
+Occasionally, you may notice some variation in the output, which may not be optimal. In such cases, you can experiment with adjusting the `num_beams`, `repetition_penalty`, and `length_penalty` parameters in the `generation_config`. Based on standard testing, the example with an input size of 1457 can be run on a single A100 GPU. However, the 1B model might require more compute resources or a lower beam size for generation.
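For reference, a minimal usage sketch in the spirit of the README's (elided) Python example. This is not the repository's exact snippet: the repo id, language codes, import path, and generation settings below are assumptions, and the `IndicProcessor` calls follow the documented IndicTransToolkit API.

```python
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit.processor import IndicProcessor  # import path may differ across toolkit versions

# Assumed repo id (taken from the name_or_path field in tokenizer_config.json).
ckpt = "prajdabre/rotary-indictrans2-indic-en-dist-200M"

tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, trust_remote_code=True)

ip = IndicProcessor(inference=True)

# Illustrative Hindi -> English input; tags follow the IndicTrans2 language-code convention.
src_sents = ["यह एक लंबा दस्तावेज़ है जिसका अनुवाद किया जाना है।"]
batch = ip.preprocess_batch(src_sents, src_lang="hin_Deva", tgt_lang="eng_Latn")

inputs = tokenizer(batch, padding="longest", truncation=True, return_tensors="pt")

with torch.inference_mode():
    generated = model.generate(
        **inputs,
        max_length=8192,         # matches the new model_max_length in this commit
        num_beams=5,             # knobs the Warning section suggests tuning
        length_penalty=1.0,
        repetition_penalty=1.0,
    )

decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
print(ip.postprocess_batch(decoded, lang="eng_Latn"))
```

As the new Note section advises, verify output quality (especially on short, sentence-level inputs) before scaling these settings up.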
config.json CHANGED
@@ -10,7 +10,7 @@
   "decoder_attention_heads": 8,
   "decoder_embed_dim": 512,
   "decoder_ffn_dim": 2048,
-  "decoder_layerdrop": 0,
+  "decoder_layerdrop": 0.0,
   "decoder_layers": 18,
   "decoder_normalize_before": true,
   "decoder_start_token_id": 2,
@@ -19,7 +19,7 @@
   "encoder_attention_heads": 8,
   "encoder_embed_dim": 512,
   "encoder_ffn_dim": 2048,
-  "encoder_layerdrop": 0,
+  "encoder_layerdrop": 0.0,
   "encoder_layers": 18,
   "encoder_normalize_before": true,
   "encoder_vocab_size": 122706,
@@ -27,21 +27,18 @@
   "init_std": 0.02,
   "is_encoder_decoder": true,
   "layernorm_embedding": true,
+  "max_source_positions": 8192,
+  "max_target_positions": 8192,
   "model_type": "RotaryIndicTrans",
   "num_hidden_layers": 18,
   "pad_token_id": 1,
   "rope_args": {
-    "theta": 10000
+    "theta": 50000
   },
   "scale_embedding": true,
   "share_decoder_input_output_embed": true,
   "torch_dtype": "float32",
-  "transformers_version": "4.44.0",
+  "transformers_version": "4.47.1",
   "use_cache": true,
-  "name_or_path": "prajdabre/rotary-indictrans2-indic-en-dist-200M",
-  "auto_map": {
-    "AutoConfig": "configuration_rotary_indictrans.RotaryIndicTransConfig",
-    "AutoModelForSeq2SeqLM": "modeling_rotary_indictrans.RotaryIndicTransForConditionalGeneration"
-  },
-  "tokenizer_class": "IndicTransTokenizer"
-}
+  "vocab_size": 32296
+}
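The functional changes above are the new 8192-token positional limits, the RoPE base `theta` moving from 10000 to 50000, and the layerdrop values becoming explicit floats. A small sketch to double-check that the published file carries these values (the repo id is an assumption taken from tokenizer_config.json; the check itself is not part of the repo):

```python
import json
from huggingface_hub import hf_hub_download

cfg_path = hf_hub_download(
    repo_id="prajdabre/rotary-indictrans2-indic-en-dist-200M",  # assumed repo id
    filename="config.json",
)
with open(cfg_path) as f:
    cfg = json.load(f)

# Values introduced or changed by this commit.
assert cfg["rope_args"]["theta"] == 50000
assert cfg["max_source_positions"] == cfg["max_target_positions"] == 8192
assert cfg["encoder_layerdrop"] == cfg["decoder_layerdrop"] == 0.0
print("config.json matches the committed values")
```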
 
 
 
 
 
generation_config.json CHANGED
@@ -4,5 +4,5 @@
   "decoder_start_token_id": 2,
   "eos_token_id": 2,
   "pad_token_id": 1,
-  "transformers_version": "4.44.0"
+  "transformers_version": "4.47.1"
 }
pytorch_model.bin → model.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ce8759926f47d14d23a7f8e84f1da94e0c71026cedabdbe62d0531315254c519
-size 847373378
+oid sha256:b96a2591892dbc970c540b25e70c02ac93b17d63a3d7e5bb39efa9c19f3f36f3
+size 847211256
modeling_rotary_indictrans.py CHANGED
@@ -108,7 +108,9 @@ class RotaryEmbedding(torch.nn.Module):
         self.max_seq_len = max_seq_len
         self.scaling_factor = scaling_factor
 
-        inv_freq_ = 1.0 / (theta ** (torch.arange(0, dim, 2, device=device).float() / dim))
+        inv_freq_ = 1.0 / (
+            theta ** (torch.arange(0, dim, 2, device=device).float() / dim)
+        )
 
         self.register_buffer("inv_freq", inv_freq_, persistent=False)
         self.precompute_freqs(max_seq_len)
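The change above only re-wraps the line; the computation is the standard RoPE inverse-frequency term, inv_freq_i = 1 / theta^(2i / dim), one value per pair of hidden dimensions. A self-contained sketch of the same expression (dim and the position count are illustrative; theta = 50000 mirrors the rope_args value set in config.json, and the cos/sin table is presumably what precompute_freqs caches up to max_seq_len):

```python
import torch

dim, theta = 64, 50000  # dim is illustrative; theta mirrors rope_args["theta"] in this commit

# Same expression as the re-wrapped line in RotaryEmbedding.__init__:
# one inverse frequency per pair of hidden dimensions.
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))  # shape: (dim // 2,)

# Position-dependent rotation angles m * inv_freq_i; their cos/sin rotate
# each (even, odd) query/key pair at position m.
positions = torch.arange(16).float()
freqs = torch.outer(positions, inv_freq)   # shape: (16, dim // 2)
cos, sin = freqs.cos(), freqs.sin()

print(inv_freq.shape, cos.shape)  # torch.Size([32]) torch.Size([16, 32])
```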
tokenizer_config.json CHANGED
@@ -37,7 +37,7 @@
   "clean_up_tokenization_spaces": true,
   "do_lower_case": false,
   "eos_token": "</s>",
-  "model_max_length": 4096,
+  "model_max_length": 8192,
   "pad_token": "<pad>",
   "name_or_path": "prajdabre/rotary-indictrans2-indic-en-dist-200M",
   "tokenizer_class": "IndicTransTokenizer",