AshwinSankar committed
Commit d6686f1 · verified · 1 Parent(s): bbcf2db

Upload model
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,27 +1,172 @@
  ---
- license: apache-2.0
  language:
- - hi
- - en
- - ta
- - te
- - bn
- - gu
- - kn
- - ml
- - mr
- - pa
- - or
- - as
  tags:
- - punctuation
- - indic-languages
- - cadence
  datasets:
  - ai4bharat/sangraha
  - HuggingFaceFW/fineweb-2
  metrics:
  - f1
- base_model:
- - google/gemma-3-1b-pt
- ---
  ---
  language:
+ - en
+ - as # Assamese
+ - bn # Bengali
+ - brx # Bodo
+ - doi # Dogri
+ - gu # Gujarati
+ - hi # Hindi
+ - kn # Kannada
+ - ks # Kashmiri
+ - kok # Konkani
+ - mai # Maithili
+ - ml # Malayalam
+ - mni # Manipuri (Meitei Mayek script requires transliteration to the Bengali script for the tokenizer)
+ - mr # Marathi
+ - ne # Nepali
+ - or # Odia
+ - pa # Punjabi
+ - sa # Sanskrit
+ - sat # Santali
+ - sd # Sindhi
+ - ta # Tamil
+ - te # Telugu
+ - ur # Urdu
+ license: mit
  tags:
+ - punctuation-restoration
+ - multilingual
+ - indic-languages
+ - ai4bharat
  datasets:
  - ai4bharat/sangraha
  - HuggingFaceFW/fineweb-2
  metrics:
  - f1
+ pipeline_tag: token-classification
+ library_name: cadence-punctuation
+ base_model: google/gemma-3-1b-it
+ widget:
+ - text: "hello world how are you today"
+   example_title: "English Punctuation"
+ - text: "यह एक हिंदी वाक्य है"
+   example_title: "Hindi Punctuation"
+ - text: "cadence is a great model for punctuation"
+   example_title: "Another English Example"
+ ---
+
+ # Cadence
+
+ A multilingual punctuation restoration model for English and 22 Indic languages, built on Gemma-3-1B.
+
+ ## Features
+ - **Multilingual Support**: English + 22 Indic languages
+ - **Script-Aware**: Handles multiple scripts with the appropriate punctuation rules
+ - **Single Model**: One model covers every language and punctuation mark (no language identifier required)
+ - **Bidirectional Encoder**: Non-causal attention sees the full context in one pass, making inference fast
+ - **Efficient Processing**: Supports batch processing and a sliding window for long texts
+ - **AutoModel Compatible**: Easy integration with the Hugging Face ecosystem
+
+ ## Installation
+
+ ```bash
+ pip install cadence-punctuation
+ ```
+
+ ## Quick Start
+
+ ### Using the Simple Interface
+
+ ```python
+ from cadence import PunctuationModel  # the "cadence-punctuation" pip package installs the `cadence` module
+
+ # Load model (local path or Hugging Face model ID)
+ model = PunctuationModel("path/to/download/weights")
+
+ # Punctuate a single text
+ text = "hello world how are you today"
+ result = model.punctuate([text])
+ print(result[0])  # "Hello world, how are you today?"
+
+ # Punctuate multiple texts
+ texts = [
+     "hello world how are you",
+     "this is another test sentence",
+     "यह एक हिंदी वाक्य है"  # Hindi example
+ ]
+ results = model.punctuate(texts, batch_size=8)
+ for original, punctuated in zip(texts, results):
+     print(f"Original: {original}")
+     print(f"Punctuated: {punctuated}")
+     print()
+ ```
+
+ ### Using AutoModel
+
+ ```python
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+
+ # Load model and tokenizer
+ model_name = "ai4bharat/Cadence"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
+
+ # Prepare input
+ text = "hello world how are you"
+ inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+
+ # Get predictions
+ with torch.no_grad():
+     outputs = model(**inputs)
+     predictions = torch.argmax(outputs.logits, dim=-1)
+
+ print(predictions)
+ ```
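+
+ The output is a tensor of per-token label IDs. A minimal decoding sketch (illustrative only, not the packaged pipeline; it relies on the `id2label` mapping shipped in `config.json` below, where the label `O` means "no punctuation after this token"):
+
+ ```python
+ # Map each predicted label ID back to its punctuation mark and
+ # re-assemble the text; special tokens (<bos>, <pad>, ...) are skipped.
+ id2label = model.config.id2label  # e.g. {0: "O", 1: ".", 2: ",", ...}
+ tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
+ pieces = []
+ for token, label_id in zip(tokens, predictions[0].tolist()):
+     if token in tokenizer.all_special_tokens:
+         continue
+     label = id2label[label_id]  # transformers normalizes these keys to int
+     pieces.append(token if label == "O" else token + label)
+ print(tokenizer.convert_tokens_to_string(pieces))
+ ```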
+
+ ## Officially Supported Languages
+ English, Assamese, Bengali, Bodo, Dogri, Gujarati, Hindi, Kannada, Kashmiri, Konkani, Maithili, Malayalam, Manipuri, Marathi, Nepali, Odia, Punjabi, Sanskrit, Santali, Sindhi, Tamil, Telugu, Urdu
+
+ The tokenizer does not support Manipuri's Meitei Mayek script; the model can punctuate Manipuri text once it has been transliterated to the Bengali script (one possible approach is sketched below).
+
+ You can also try this model on languages not listed above, though performance may vary.
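+
+ A minimal transliteration sketch (the third-party `aksharamukha` package and the script names used here are assumptions for illustration; any Meitei Mayek → Bengali transliterator works):
+
+ ```python
+ # pip install aksharamukha
+ from aksharamukha import transliterate
+
+ mni_text = "..."  # Manipuri text written in Meitei Mayek script
+ # Script identifiers are assumptions; check the aksharamukha documentation.
+ bengali_text = transliterate.process("MeeteiMayek", "Bengali", mni_text)
+
+ # "model" is the PunctuationModel instance from the Quick Start example
+ result = model.punctuate([bengali_text])
+ ```
+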
+ ## Supported Punctuation
+ The model can predict the following punctuation marks:
+ - Period (.)
+ - Comma (,)
+ - Question mark (?)
+ - Exclamation mark (!)
+ - Semicolon (;)
+ - Colon (:)
+ - Hyphen (-)
+ - Quotes (" and ')
+ - Ellipsis (...)
+ - Parentheses ( and )
+ - Hindi danda (।)
+ - Urdu punctuation (۔ ، ؟)
+ - Arabic punctuation (٬ ،)
+ - Santali punctuation (᱾ ᱾।)
+ - Sanskrit double danda (॥)
+ - And various combinations (e.g. ." or ।")
+
+ ## Configuration Options
+
+ ### PunctuationModel Parameters
+
+ - `model_path`: Local path to the model weights (or the directory to download them into)
+ - `gpu_id`: GPU device ID (None for auto-detection)
+ - `cpu`: Force CPU usage (default: False)
+ - `max_length`: Maximum sequence length (default: 300)
+ - `sliding_window`: Enable a sliding window for long texts (default: True)
+ - `verbose`: Enable verbose logging (default: False)
+ - `d_type`: Precision with which the weights are loaded (default: bfloat16)
+
+ ```python
+ # Custom configuration
+ model = PunctuationModel(
+     model_path="path/to/download/weights",
+     gpu_id=0,             # Use a specific GPU
+     max_length=512,       # Allow longer sequences
+     sliding_window=True,  # Handle long texts
+     verbose=False,        # Quiet mode
+     d_type="bfloat16"
+ )
+
+ # Process long texts with the sliding window
+ long_text = "Your very long text here..." * 100
+ result = model.punctuate([long_text])
+ ```
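+
+ With `sliding_window=False`, inputs longer than `max_length` are presumably truncated (an assumption worth verifying for your version). A naive manual fallback, purely illustrative, since the built-in sliding window already stitches overlapping windows and should normally be preferred:
+
+ ```python
+ # Split on whitespace into ~250-word chunks, punctuate each chunk
+ # independently, then rejoin. Unlike the built-in sliding window,
+ # this loses cross-chunk context at the boundaries.
+ words = long_text.split()
+ chunks = [" ".join(words[i:i + 250]) for i in range(0, len(words), 250)]
+ punctuated_chunks = model.punctuate(chunks, batch_size=8)
+ print(" ".join(punctuated_chunks))
+ ```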
config.json ADDED
@@ -0,0 +1,102 @@
+ {
+   "architectures": [
+     "Gemma3ForTokenClassification"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "attn_logit_softcapping": null,
+   "bos_token_id": 2,
+   "cache_implementation": "hybrid",
+   "classifier_dropout_prob": 0.0,
+   "eos_token_id": 1,
+   "final_logit_softcapping": null,
+   "head_dim": 256,
+   "hidden_activation": "gelu_pytorch_tanh",
+   "hidden_size": 1152,
+   "id2label": {
+     "0": "O",
+     "1": ".",
+     "10": "\"",
+     "11": "\u0964",
+     "12": "(",
+     "13": ")",
+     "14": ":",
+     "15": "\u066c",
+     "16": "\u06d4",
+     "17": "\u061f",
+     "18": ".\"",
+     "19": ").",
+     "2": ",",
+     "20": "),",
+     "21": "\",",
+     "22": "\".",
+     "23": "?\"",
+     "24": "\"?",
+     "25": "\u0964\"",
+     "26": "\"\u0964",
+     "27": "\u060c",
+     "28": "\u1c7e",
+     "29": "\u0965",
+     "3": "?",
+     "30": "\u1c7e\u0964",
+     "4": "-",
+     "5": ";",
+     "6": "_",
+     "7": "!",
+     "8": "'",
+     "9": "..."
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 6912,
+   "label2id": {
+     "!": 7,
+     "\"": 10,
+     "\",": 21,
+     "\".": 22,
+     "\"?": 24,
+     "\"\u0964": 26,
+     "'": 8,
+     "(": 12,
+     ")": 13,
+     "),": 20,
+     ").": 19,
+     ",": 2,
+     "-": 4,
+     ".": 1,
+     ".\"": 18,
+     "...": 9,
+     ":": 14,
+     ";": 5,
+     "?": 3,
+     "?\"": 23,
+     "O": 0,
+     "_": 6,
+     "\u060c": 27,
+     "\u061f": 17,
+     "\u066c": 15,
+     "\u06d4": 16,
+     "\u0964": 11,
+     "\u0964\"": 25,
+     "\u0965": 29,
+     "\u1c7e": 28,
+     "\u1c7e\u0964": 30
+   },
+   "max_position_embeddings": 32768,
+   "model_type": "cadence_punctuation",
+   "num_attention_heads": 4,
+   "num_hidden_layers": 26,
+   "num_key_value_heads": 1,
+   "pad_token_id": 0,
+   "query_pre_attn_scalar": 256,
+   "rms_norm_eps": 1e-06,
+   "rope_local_base_freq": 10000,
+   "rope_scaling": null,
+   "rope_theta": 1000000,
+   "sliding_window": 512,
+   "sliding_window_pattern": 6,
+   "torch_dtype": "float32",
+   "transformers_version": "4.51.3",
+   "use_cache": true,
+   "use_non_causal_attention": true,
+   "vocab_size": 262146
+ }
generation_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 2,
+   "cache_implementation": "hybrid",
+   "eos_token_id": 1,
+   "pad_token_id": 0,
+   "transformers_version": "4.51.3"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2fd66fdcd492b575773877dbaa9a113a5f710d8a79020cfb26c35927dcfc9142
+ size 3999735300
special_tokens_map.json ADDED
@@ -0,0 +1,40 @@
+ {
+   "boi_token": "<start_of_image>",
+   "bos_token": {
+     "content": "<bos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eoi_token": "<end_of_image>",
+   "eos_token": {
+     "content": "<eos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "image_token": "<image_soft_token>",
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88ec6df915623f4b307188dbb6fe60ddb8a1ef273c864ba38de97a320dd17dea
+ size 33384751
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff