prajdabre committed (verified) · commit 118dc6a · 1 parent: 21ea539

Update config file and model.safetensors

README.md CHANGED
@@ -1,17 +1,18 @@
 ---
 license: mit
 ---
-# Models
-These models are created from their respective IndicTrans2 parent versions by simply replacing the Sinusoidal Positional Embedding with Rotary Positional Embedding ([Su _et al._](https://arxiv.org/abs/2104.09864)), and finetuning them for further alignment.
 
+# Models
 
+These models are created from their respective IndicTrans2 parent versions by simply replacing the Sinusoidal Positional Embedding with Rotary Positional Embedding ([Su _et al._](https://arxiv.org/abs/2104.09864)), and finetuning them for further alignment.
 
-*NOTE*:
-These models are my independent reproduction of the paper: [Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models](https://arxiv.org/abs/2408.11382).
+_NOTE_:
+These models are my independent reproduction of the paper: [Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models](https://arxiv.org/abs/2408.11382).
 
-Detailed information on the data mixture, hyperparameters, and training curriculum can be found in the paper.
+Detailed information on the data mixture, hyperparameters, and training curriculum can be found in the paper.
 
 # Usage
+
 The usage instructions are very similar to the [IndicTrans2 HuggingFace models](https://huggingface.co/collections/ai4bharat/indictrans2-664ccb91d23bbae0d681c3ca); you'll need to install the [IndicTransToolkit](https://github.com/VarunGumma/IndicTransToolkit) to use the `IndicProcessor` module to pre-process the source texts before generating translations.
 
 ```python
@@ -64,19 +65,25 @@ print(" | > Translations:", outputs[0])
 ```
 
 # Citation
+
 If you use these models directly or fine-tune them further for additional use cases, please cite the following work:
 
 ```bibtex
 @misc{gumma2025inducinglongcontextabilitiesmultilingual,
-  title={Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models},
+  title={Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models},
   author={Varun Gumma and Pranjal A. Chitale and Kalika Bali},
   year={2025},
   eprint={2408.11382},
   archivePrefix={arXiv},
   primaryClass={cs.CL},
-  url={https://arxiv.org/abs/2408.11382},
+  url={https://arxiv.org/abs/2408.11382},
 }
 ```
 
+# Note
+
+These updated models are primarily built and tested for document-level and long-context translation; performance on shorter, sentence-level inputs may be sub-optimal and may require tuning of the generation parameters. Please thoroughly verify the models on your use case before scaling up generation.
+
 # Warning
+
-Occasionally, you may notice some variation in the output, which may not be optimal. In such cases, you can experiment with adjusting the `num_beams`, `repetition_penalty`, and `length_penalty` parameters in the `generation_config`. Based on standard testing, the example with an input size of 1457 can be run on a single A100 GPU. However, the 1B model might require more compute resources or a lower beam size for generation.
+Occasionally, you may notice some variation in the output, which may not be optimal. In such cases, you can experiment with adjusting the `num_beams`, `repetition_penalty`, and `length_penalty` parameters in the `generation_config`. Based on standard testing, the example with an input size of 1457 can be run on a single A100 GPU. However, the 1B model might require more compute resources or a lower beam size for generation.
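For reference, a minimal usage sketch in the spirit of the README's (elided) Python example. This is not the repository's exact snippet: the repo id, language codes, import path, and generation settings below are assumptions, and the `IndicProcessor` calls follow the documented IndicTransToolkit API.

```python
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit.processor import IndicProcessor  # import path may differ across toolkit versions

# Assumed repo id (taken from the name_or_path field in tokenizer_config.json).
ckpt = "prajdabre/rotary-indictrans2-indic-en-dist-200M"

tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, trust_remote_code=True)

ip = IndicProcessor(inference=True)

# Illustrative Hindi -> English input; tags follow the IndicTrans2 language-code convention.
src_sents = ["यह एक लंबा दस्तावेज़ है जिसका अनुवाद किया जाना है।"]
batch = ip.preprocess_batch(src_sents, src_lang="hin_Deva", tgt_lang="eng_Latn")

inputs = tokenizer(batch, padding="longest", truncation=True, return_tensors="pt")

with torch.inference_mode():
    generated = model.generate(
        **inputs,
        max_length=8192,         # matches the new model_max_length in this commit
        num_beams=5,             # knobs the Warning section suggests tuning
        length_penalty=1.0,
        repetition_penalty=1.0,
    )

decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
print(ip.postprocess_batch(decoded, lang="eng_Latn"))
```

As the new Note section advises, verify output quality (especially on short, sentence-level inputs) before scaling these settings up.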
config.json CHANGED
@@ -10,7 +10,7 @@
   "decoder_attention_heads": 8,
   "decoder_embed_dim": 512,
   "decoder_ffn_dim": 2048,
-  "decoder_layerdrop": 0,
+  "decoder_layerdrop": 0.0,
   "decoder_layers": 18,
   "decoder_normalize_before": true,
   "decoder_start_token_id": 2,
@@ -19,7 +19,7 @@
   "encoder_attention_heads": 8,
   "encoder_embed_dim": 512,
   "encoder_ffn_dim": 2048,
-  "encoder_layerdrop": 0,
+  "encoder_layerdrop": 0.0,
   "encoder_layers": 18,
   "encoder_normalize_before": true,
   "encoder_vocab_size": 122706,
@@ -27,21 +27,18 @@
   "init_std": 0.02,
   "is_encoder_decoder": true,
   "layernorm_embedding": true,
+  "max_source_positions": 8192,
+  "max_target_positions": 8192,
   "model_type": "RotaryIndicTrans",
   "num_hidden_layers": 18,
   "pad_token_id": 1,
   "rope_args": {
-    "theta": 10000
+    "theta": 50000
   },
   "scale_embedding": true,
   "share_decoder_input_output_embed": true,
   "torch_dtype": "float32",
-  "transformers_version": "4.44.0",
+  "transformers_version": "4.47.1",
   "use_cache": true,
-  "name_or_path": "prajdabre/rotary-indictrans2-indic-en-dist-200M",
-  "auto_map": {
-    "AutoConfig": "configuration_rotary_indictrans.RotaryIndicTransConfig",
-    "AutoModelForSeq2SeqLM": "modeling_rotary_indictrans.RotaryIndicTransForConditionalGeneration"
-  },
-  "tokenizer_class": "IndicTransTokenizer"
-}
+  "vocab_size": 32296
+}
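The functional changes above are the new 8192-token positional limits, the RoPE base `theta` moving from 10000 to 50000, and the layerdrop values becoming explicit floats. A small sketch to double-check that the published file carries these values (the repo id is an assumption taken from tokenizer_config.json; the check itself is not part of the repo):

```python
import json
from huggingface_hub import hf_hub_download

cfg_path = hf_hub_download(
    repo_id="prajdabre/rotary-indictrans2-indic-en-dist-200M",  # assumed repo id
    filename="config.json",
)
with open(cfg_path) as f:
    cfg = json.load(f)

# Values introduced or changed by this commit.
assert cfg["rope_args"]["theta"] == 50000
assert cfg["max_source_positions"] == cfg["max_target_positions"] == 8192
assert cfg["encoder_layerdrop"] == cfg["decoder_layerdrop"] == 0.0
print("config.json matches the committed values")
```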
 
 
 
 
 
generation_config.json CHANGED
@@ -4,5 +4,5 @@
   "decoder_start_token_id": 2,
   "eos_token_id": 2,
   "pad_token_id": 1,
-  "transformers_version": "4.44.0"
+  "transformers_version": "4.47.1"
 }
pytorch_model.bin → model.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ce8759926f47d14d23a7f8e84f1da94e0c71026cedabdbe62d0531315254c519
-size 847373378
+oid sha256:b96a2591892dbc970c540b25e70c02ac93b17d63a3d7e5bb39efa9c19f3f36f3
+size 847211256
modeling_rotary_indictrans.py CHANGED
@@ -108,7 +108,9 @@ class RotaryEmbedding(torch.nn.Module):
         self.max_seq_len = max_seq_len
         self.scaling_factor = scaling_factor
 
-        inv_freq_ = 1.0 / (theta ** (torch.arange(0, dim, 2, device=device).float() / dim))
+        inv_freq_ = 1.0 / (
+            theta ** (torch.arange(0, dim, 2, device=device).float() / dim)
+        )
 
         self.register_buffer("inv_freq", inv_freq_, persistent=False)
         self.precompute_freqs(max_seq_len)
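The change above only re-wraps the line; the computation is the standard RoPE inverse-frequency term, inv_freq_i = 1 / theta^(2i / dim), one value per pair of hidden dimensions. A self-contained sketch of the same expression (dim and the position count are illustrative; theta = 50000 mirrors the rope_args value set in config.json, and the cos/sin table is presumably what precompute_freqs caches up to max_seq_len):

```python
import torch

dim, theta = 64, 50000  # dim is illustrative; theta mirrors rope_args["theta"] in this commit

# Same expression as the re-wrapped line in RotaryEmbedding.__init__:
# one inverse frequency per pair of hidden dimensions.
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))  # shape: (dim // 2,)

# Position-dependent rotation angles m * inv_freq_i; their cos/sin rotate
# each (even, odd) query/key pair at position m.
positions = torch.arange(16).float()
freqs = torch.outer(positions, inv_freq)   # shape: (16, dim // 2)
cos, sin = freqs.cos(), freqs.sin()

print(inv_freq.shape, cos.shape)  # torch.Size([32]) torch.Size([16, 32])
```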
tokenizer_config.json CHANGED
@@ -37,7 +37,7 @@
   "clean_up_tokenization_spaces": true,
   "do_lower_case": false,
   "eos_token": "</s>",
-  "model_max_length": 4096,
+  "model_max_length": 8192,
   "pad_token": "<pad>",
   "name_or_path": "prajdabre/rotary-indictrans2-indic-en-dist-200M",
   "tokenizer_class": "IndicTransTokenizer",