socheatasokhachan committed on
Commit d0298df · verified · 1 Parent(s): ae4b26a

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +11 -0
  2. added_tokens.json +6 -0
  3. config.json +102 -0
  4. generation_config.json +8 -0
  5. khmerhomophonecorrector/.DS_Store +0 -0
  6. khmerhomophonecorrector/.gitattributes +46 -0
  7. khmerhomophonecorrector/FYP_ Model Tracking - Sheet1.csv +107 -0
  8. khmerhomophonecorrector/README.md +3 -0
  9. khmerhomophonecorrector/app.py +216 -0
  10. khmerhomophonecorrector/batch_size_impact.png +3 -0
  11. khmerhomophonecorrector/data/.DS_Store +0 -0
  12. khmerhomophonecorrector/data/test.json +0 -0
  13. khmerhomophonecorrector/data/train.json +3 -0
  14. khmerhomophonecorrector/data/val.json +0 -0
  15. khmerhomophonecorrector/dataset_distribution.png +0 -0
  16. khmerhomophonecorrector/header.png +3 -0
  17. khmerhomophonecorrector/homophone_pairs.json +3 -0
  18. khmerhomophonecorrector/homophone_test.json +272 -0
  19. khmerhomophonecorrector/infer_from_json.py +270 -0
  20. khmerhomophonecorrector/khmerhomophonecorrector/added_tokens.json +6 -0
  21. khmerhomophonecorrector/khmerhomophonecorrector/config.json +102 -0
  22. khmerhomophonecorrector/khmerhomophonecorrector/generation_config.json +8 -0
  23. khmerhomophonecorrector/khmerhomophonecorrector/model.safetensors +3 -0
  24. khmerhomophonecorrector/khmerhomophonecorrector/special_tokens_map.json +21 -0
  25. khmerhomophonecorrector/khmerhomophonecorrector/spiece.model +3 -0
  26. khmerhomophonecorrector/khmerhomophonecorrector/tokenizer_config.json +99 -0
  27. khmerhomophonecorrector/khmerhomophonecorrector/training_args.bin +3 -0
  28. khmerhomophonecorrector/khmerhomophonecorrector/training_state.json +1 -0
  29. khmerhomophonecorrector/loss_comparison.png +3 -0
  30. khmerhomophonecorrector/metrics_comparison.png +3 -0
  31. khmerhomophonecorrector/model_performance_line_chart.html +0 -0
  32. khmerhomophonecorrector/model_performance_line_chart.png +0 -0
  33. khmerhomophonecorrector/model_performance_line_chart_big_bs32_e40.html +0 -0
  34. khmerhomophonecorrector/model_performance_line_chart_big_bs32_e40.png +0 -0
  35. khmerhomophonecorrector/model_performance_table.html +80 -0
  36. khmerhomophonecorrector/test_results.txt +0 -0
  37. khmerhomophonecorrector/tool/.DS_Store +0 -0
  38. khmerhomophonecorrector/tool/__pycache__/khnormal.cpython-312.pyc +0 -0
  39. khmerhomophonecorrector/tool/balance_data.py +39 -0
  40. khmerhomophonecorrector/tool/clean_data.py +52 -0
  41. khmerhomophonecorrector/tool/combine_homophones.py +46 -0
  42. khmerhomophonecorrector/tool/complete_homophone_sentences.py +182 -0
  43. khmerhomophonecorrector/tool/convert_format.py +107 -0
  44. khmerhomophonecorrector/tool/convert_training_data.py +108 -0
  45. khmerhomophonecorrector/tool/debug_homophone_check.py +54 -0
  46. khmerhomophonecorrector/tool/filter.py +40 -0
  47. khmerhomophonecorrector/tool/homophone_missing.py +88 -0
  48. khmerhomophonecorrector/tool/khnormal.py +158 -0
  49. khmerhomophonecorrector/tool/normalize_khmer.py +52 -0
  50. khmerhomophonecorrector/tool/segmentation.py +48 -0
.gitattributes CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ khmerhomophonecorrector/batch_size_impact.png filter=lfs diff=lfs merge=lfs -text
+ khmerhomophonecorrector/data/train.json filter=lfs diff=lfs merge=lfs -text
+ khmerhomophonecorrector/header.png filter=lfs diff=lfs merge=lfs -text
+ khmerhomophonecorrector/homophone_pairs.json filter=lfs diff=lfs merge=lfs -text
+ khmerhomophonecorrector/loss_comparison.png filter=lfs diff=lfs merge=lfs -text
+ khmerhomophonecorrector/metrics_comparison.png filter=lfs diff=lfs merge=lfs -text
+ khmerhomophonecorrector/training_loss.png filter=lfs diff=lfs merge=lfs -text
+ khmerhomophonecorrector/visualization/batch_size_impact.png filter=lfs diff=lfs merge=lfs -text
+ khmerhomophonecorrector/visualization/loss_comparison.png filter=lfs diff=lfs merge=lfs -text
+ khmerhomophonecorrector/visualization/metrics_comparison.png filter=lfs diff=lfs merge=lfs -text
+ khmerhomophonecorrector/visualization/training_loss.png filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "</s>": 32001,
+   "<2en>": 32003,
+   "<2km>": 32002,
+   "<s>": 32000
+ }
config.json ADDED
@@ -0,0 +1,102 @@
+ {
+   "activation_dropout": 0.1,
+   "activation_function": "gelu",
+   "adaptor_activation_function": "gelu",
+   "adaptor_dropout": 0.1,
+   "adaptor_hidden_size": 512,
+   "adaptor_init_std": 0.02,
+   "adaptor_scaling_factor": 1.0,
+   "adaptor_tuning": false,
+   "additional_source_wait_k": -1,
+   "alibi_encoding": false,
+   "architectures": [
+     "MBartForConditionalGeneration"
+   ],
+   "asymmetric_alibi_encoding": false,
+   "attention_dropout": 0.1,
+   "bos_token_id": 32000,
+   "bottleneck_mid_fusion_tokens": 4,
+   "classifier_dropout": 0.0,
+   "d_model": 1024,
+   "decoder_adaptor_tying_config": null,
+   "decoder_attention_heads": 16,
+   "decoder_ffn_dim": 4096,
+   "decoder_layerdrop": 0.0,
+   "decoder_layers": 6,
+   "decoder_tying_config": null,
+   "deep_adaptor_tuning": false,
+   "deep_adaptor_tuning_ffn_only": false,
+   "dropout": 0.1,
+   "embed_low_rank_dim": 0,
+   "encoder_adaptor_tying_config": null,
+   "encoder_attention_heads": 16,
+   "encoder_ffn_dim": 4096,
+   "encoder_layerdrop": 0.0,
+   "encoder_layers": 6,
+   "encoder_tying_config": null,
+   "eos_token_id": 32001,
+   "expert_ffn_size": 128,
+   "features_embed_dims": null,
+   "features_vocab_sizes": null,
+   "forced_eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "gradient_reversal_for_domain_classifier": false,
+   "hypercomplex": false,
+   "hypercomplex_n": 2,
+   "ia3_adaptors": false,
+   "init_std": 0.02,
+   "initialization_scheme": "static",
+   "is_encoder_decoder": true,
+   "layernorm_adaptor_input": false,
+   "layernorm_prompt_projection": false,
+   "lora_adaptor_rank": 2,
+   "lora_adaptors": false,
+   "max_position_embeddings": 1024,
+   "mid_fusion_layers": 3,
+   "model_type": "mbart",
+   "moe_adaptors": false,
+   "multi_source": false,
+   "multi_source_method": null,
+   "multilayer_softmaxing": null,
+   "no_embed_norm": false,
+   "no_positional_encoding_decoder": false,
+   "no_positional_encoding_encoder": false,
+   "no_projection_prompt": false,
+   "no_scale_attention_embedding": false,
+   "num_domains_for_domain_classifier": 1,
+   "num_experts": 8,
+   "num_hidden_layers": 6,
+   "num_moe_adaptor_experts": 4,
+   "num_prompts": 100,
+   "num_sparsify_blocks": 8,
+   "pad_token_id": 0,
+   "parallel_adaptors": false,
+   "positional_encodings": false,
+   "postnorm_decoder": false,
+   "postnorm_encoder": false,
+   "prompt_dropout": 0.1,
+   "prompt_init_std": 0.02,
+   "prompt_projection_hidden_size": 4096,
+   "prompt_tuning": false,
+   "recurrent_projections": 1,
+   "residual_connection_adaptor": false,
+   "residual_connection_prompt": false,
+   "rope_encoding": false,
+   "scale_embedding": false,
+   "softmax_bias_tuning": false,
+   "softmax_temperature": 1.0,
+   "sparsification_temperature": 3.0,
+   "sparsify_attention": false,
+   "sparsify_ffn": false,
+   "target_vocab_size": 0,
+   "temperature_calibration": false,
+   "tokenizer_class": "AlbertTokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.52.4",
+   "unidirectional_encoder": false,
+   "use_cache": true,
+   "use_moe": false,
+   "use_tanh_activation_prompt": false,
+   "vocab_size": 32004,
+   "wait_k": -1
+ }
generation_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 32000,
+   "eos_token_id": 32001,
+   "forced_eos_token_id": 2,
+   "pad_token_id": 0,
+   "transformers_version": "4.52.4"
+ }
khmerhomophonecorrector/.DS_Store ADDED
Binary file (12.3 kB).
 
khmerhomophonecorrector/.gitattributes ADDED
@@ -0,0 +1,46 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ batch_size_impact.png filter=lfs diff=lfs merge=lfs -text
+ data/train.json filter=lfs diff=lfs merge=lfs -text
+ header.png filter=lfs diff=lfs merge=lfs -text
+ homophone_pairs.json filter=lfs diff=lfs merge=lfs -text
+ loss_comparison.png filter=lfs diff=lfs merge=lfs -text
+ metrics_comparison.png filter=lfs diff=lfs merge=lfs -text
+ training_loss.png filter=lfs diff=lfs merge=lfs -text
+ visualization/batch_size_impact.png filter=lfs diff=lfs merge=lfs -text
+ visualization/loss_comparison.png filter=lfs diff=lfs merge=lfs -text
+ visualization/metrics_comparison.png filter=lfs diff=lfs merge=lfs -text
+ visualization/training_loss.png filter=lfs diff=lfs merge=lfs -text
khmerhomophonecorrector/FYP_ Model Tracking - Sheet1.csv ADDED
@@ -0,0 +1,107 @@
+ Model Name,Batch Size,Num of Epochs,Epochs,Train Loss,Val Loss,WER,BLEU-1,BLEU-2,BLEU-3,BLEU-4,Notes,
+ prahokbart_base,8,10,1,0.1789,0.118885,0.0095,99.4068,98.8971,98.4134,97.9704,https://drive.google.com/drive/folders/16BOObaDAzmx6yl__UavLhbQUQj1JiTph,
+ ,,,2,0.0808,0.066198,,,,,,,
+ ,,,3,0.0648,0.049457,,,,,,,
+ ,,,4,0.0466,0.040769,,,,,,,
+ ,,,5,0.0402,0.035832,,,,,,,
+ ,,,6,0.029,0.032629,,,,,,,
+ ,,,7,0.0419,0.030779,,,,,,,
+ ,,,8,0.0199,0.030187,,,,,,,
+ ,,,9,0.018,0.029398,,,,,,,
+ ,,,10,0.017,0.028081,,,,,,,
+ ,,,,,,,,,,,,
+ prahokbart_base,16,10,1,0.344,0.243551,0.0146,98.9618,98.0396,97.1388,96.2967,prahokbart-base-E10-B16.ipynb,prahokbart-base-bs16-e10
+ ,,,2,0.1701,0.127156,,,,,,,
+ ,,,3,0.113,0.094204,,,,,,,
+ ,,,4,0.0994,0.077294,,,,,,,
+ ,,,5,0.0826,0.06774,,,,,,,
+ ,,,6,0.0744,0.061196,,,,,,,
+ ,,,7,0.0727,0.056898,,,,,,,
+ ,,,8,0.0583,0.054359,,,,,,,
+ ,,,9,0.0512,0.053133,,,,,,,
+ ,,,10,0.0567,0.052445,,,,,,,
+ ,,,,,,,,,,,,
+ prahokbart_base,32,10,1,0.4248,,0.0217,98.3017,96.752,95.2542,93.8637,prahokbart-base-E10-B32.ipynb,prahokbart-base-bs32-e10
+ ,,,2,0.2198,0.1806311905,,,,,,,
+ ,,,3,0.1831,,,,,,,,
+ ,,,4,0.15,,,,,,,,
+ ,,,5,0.137,0.1235590726,,,,,,,
+ ,,,6,0.1273,,,,,,,,
+ ,,,7,0.1204,0.1003917083,,,,,,,
+ ,,,8,0.1096,,,,,,,,
+ ,,,9,0.1099,,,,,,,,
+ ,,,10,0.1061,0.09450948983,,,,,,,
+ ,,,,,,,,,,,,
+ prahokbart_big,8,10,1,0.1789,0.118885,0.0095,99.4068,98.8971,98.4134,97.9704,prahokbart-big-E10-B8.ipynb,prahokbart-big-bs8-e10
+ ,,,2,0.0808,0.066198,,,,,,,
+ ,,,3,0.0648,0.049457,,,,,,,
+ ,,,4,0.0466,0.040769,,,,,,,
+ ,,,5,0.0402,0.035832,,,,,,,
+ ,,,6,0.029,0.032629,,,,,,,
+ ,,,7,0.0419,0.030779,,,,,,,
+ ,,,8,0.0199,0.030187,,,,,,,
+ ,,,9,0.018,0.029398,,,,,,,
+ ,,,10,0.0268,0.029082,,,,,,,
+ ,,,,,,,,,,,,
+ prahokbart_big,16,10,1,0.237,0.156053,0.012,99.1946,98.5264,97.8799,97.2795,prahokbart-big-E10-B16.ipynb,prahokbart-big-bs16-e10
+ ,,,2,0.1137,0.080692,,,,,,,
+ ,,,3,0.08,0.062454,,,,,,,
+ ,,,4,0.0672,0.051034,,,,,,,
+ ,,,5,0.0537,0.045366,,,,,,,
+ ,,,6,0.0474,0.041196,,,,,,,
+ ,,,7,0.048,0.038459,,,,,,,
+ ,,,8,0.0366,0.036974,,,,,,,
+ ,,,9,0.0305,0.036123,,,,,,,
+ ,,,10,0.038,0.035709,,,,,,,
+ ,,,,,,,,,,,,
+ prahokbart_big,32,10,1,0.4347,0.328299,0.0142,99.0066,98.1694,97.3646,96.6186,prahokbart-big-E10-B32.ipynb,prahokbart-big-bs32-e10
+ ,,,2,0.1448,0.107667,,,,,,,
+ ,,,3,0.1,0.080751,,,,,,,
+ ,,,4,0.0857,0.066501,,,,,,,
+ ,,,5,0.0717,0.059016,,,,,,,
+ ,,,6,0.0608,0.053938,,,,,,,
+ ,,,7,0.0606,0.050479,,,,,,,
+ ,,,8,0.0569,0.048502,,,,,,,
+ ,,,9,0.0569,0.047486,,,,,,,
+ ,,,10,0.0484,0.047022,,,,,,,
+ ,,,,,,,,,,,,
+ prahokbart_big,32,40,1,0.6786,0.59872,0.008,99.5398,99.162,98.8093,98.4861,prahokbart-big-E10-B32.ipynb,
+ ,,,2,0.3993,0.318888,,,,,,,
+ ,,,3,0.1638,0.126617,,,,,,,
+ ,,,4,0.1196,0.088467,,,,,,,
+ ,,,5,0.0861,0.068045,,,,,,,
+ ,,,6,0.0663,0.056211,,,,,,,
+ ,,,7,0.0599,0.0488,,,,,,,
+ ,,,8,0.0516,0.043238,,,,,,,
+ ,,,9,0.047,0.039321,,,,,,,
+ ,,,10,0.0357,0.035333,,,,,,,
+ ,,,11,0.0377,0.03289,,,,,,,
+ ,,,12,0.0335,0.030855,,,,,,,
+ ,,,13,0.0279,0.029597,,,,,,,
+ ,,,14,0.0362,0.028269,,,,,,,
+ ,,,15,0.0206,0.027406,,,,,,,
+ ,,,16,0.0229,0.026543,,,,,,,
+ ,,,17,0.0197,0.026183,,,,,,,
+ ,,,18,0.0167,0.025577,,,,,,,
+ ,,,19,0.0181,0.02498,,,,,,,
+ ,,,20,0.0153,0.024927,,,,,,,
+ ,,,21,0.0137,0.024544,,,,,,,
+ ,,,22,0.0166,0.024343,,,,,,,
+ ,,,23,0.0134,0.024054,,,,,,,
+ ,,,24,0.0121,0.023849,,,,,,,
+ ,,,25,0.015,0.023575,,,,,,,
+ ,,,26,0.0114,0.023603,,,,,,,
+ ,,,27,0.0107,0.023624,,,,,,,
+ ,,,28,0.0113,0.023694,,,,,,,
+ ,,,29,0.0113,0.02336,,,,,,,
+ ,,,30,0.0087,0.023514,,,,,,,
+ ,,,31,0.0103,0.023472,,,,,,,
+ ,,,32,0.0082,0.023636,,,,,,,
+ ,,,33,0.0112,0.02359,,,,,,,
+ ,,,34,0.0086,0.023592,,,,,,,
+ ,,,35,0.0081,0.023537,,,,,,,
+ ,,,36,0.009,0.023482,,,,,,,
+ ,,,37,0.0089,0.023521,,,,,,,
+ ,,,38,0.009,0.023539,,,,,,,
+ ,,,39,0.0078,0.02354,,,,,,,
+ ,,,40,0.0091,0.023525,,,,,,,
khmerhomophonecorrector/README.md ADDED
@@ -0,0 +1,3 @@
+ ---
+ license: apache-2.0
+ ---
khmerhomophonecorrector/app.py ADDED
@@ -0,0 +1,216 @@
+ import streamlit as st
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, MBartForConditionalGeneration
+ import json
+ from khmernltk import word_tokenize
+ import torch
+ import difflib
+
+ # Set page config
+ st.set_page_config(
+     page_title="Khmer Homophone Corrector",
+     page_icon="✍️",
+     layout="wide"
+ )
+
+ # Custom CSS
+ st.markdown("""
+ <style>
+     .main {
+         padding: 2rem;
+     }
+     .stTextArea textarea {
+         font-size: 1.2rem;
+     }
+     .result-text {
+         font-size: 1.2rem;
+         padding: 1rem;
+         background-color: #f8f9fa;
+         border-radius: 0.5rem;
+         margin: 0.5rem 0;
+     }
+     .correction {
+         background-color: #ffd700;
+         padding: 0.2rem;
+         border-radius: 0.2rem;
+     }
+     .correction-details {
+         font-size: 1rem;
+         color: #666;
+         margin-top: 0.5rem;
+     }
+     .header-image {
+         width: 100%;
+         max-width: 800px;
+         margin: 0 auto;
+         display: block;
+     }
+     .model-info {
+         font-size: 0.9rem;
+         color: #666;
+         margin-top: 0.5rem;
+     }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # Display header image
+ st.image("header.png", use_column_width=True)
+
+ # Model configurations
+ MODEL_CONFIG = {
+     "path": "./prahokbart-big-bs32-e40",
+     "description": "Large model with batch size 32, trained for 40 epochs"
+ }
+
+ def word_segment(text):
+     return " ".join(word_tokenize(text)).replace(" ", " ▂ ")
+
+ def find_corrections(original, corrected):
+     original_words = [w for w in word_tokenize(original) if w.strip()]
+     corrected_words = [w for w in word_tokenize(corrected) if w.strip()]
+
+     matcher = difflib.SequenceMatcher(None, original_words, corrected_words)
+     corrections = []
+
+     for tag, i1, i2, j1, j2 in matcher.get_opcodes():
+         if tag != 'equal':
+             original_text = ' '.join(original_words[i1:i2])
+             corrected_text = ' '.join(corrected_words[j1:j2])
+             if original_text.strip() and corrected_text.strip() and original_text != corrected_text:
+                 corrections.append({
+                     'original': original_text,
+                     'corrected': corrected_text,
+                     'position': i1
+                 })
+
+     return corrections
+
+ @st.cache_resource
+ def load_model(model_path):
+     try:
+         model = MBartForConditionalGeneration.from_pretrained(model_path)
+         tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+         model.eval()
+
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         model = model.to(device)
+
+         return {
+             "model": model,
+             "tokenizer": tokenizer,
+             "device": device
+         }
+     except Exception as e:
+         st.error(f"Error loading model: {str(e)}")
+         return None
+
+ def process_text(text, model_components):
+     if model_components is None:
+         return "Error: Model not loaded properly"
+
+     model = model_components["model"]
+     tokenizer = model_components["tokenizer"]
+     device = model_components["device"]
+
+     segmented_text = word_segment(text)
+     input_text = f"{segmented_text} </s> <2km>"
+
+     inputs = tokenizer(
+         input_text,
+         return_tensors="pt",
+         padding=True,
+         truncation=True,
+         max_length=1024,
+         add_special_tokens=True
+     )
+
+     inputs = {k: v.to(device) for k, v in inputs.items()}
+
+     if 'token_type_ids' in inputs:
+         del inputs['token_type_ids']
+
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_length=1024,
+             num_beams=5,
+             early_stopping=True,
+             do_sample=False,
+             no_repeat_ngram_size=3,
+             forced_bos_token_id=32000,
+             forced_eos_token_id=32001,
+             length_penalty=1.0,
+             temperature=1.0
+         )
+
+     corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     corrected = corrected.replace("</s>", "").replace("<2km>", "").replace("▂", " ").strip()
+
+     return corrected
+
+ # Header
+ st.title("✍️ Khmer Homophone Corrector")
+
+ # Simple instruction
+ st.markdown("Type or paste your Khmer text below to correct homophones.")
+
+ # Create two columns for input and output
+ col1, col2 = st.columns(2)
+
+ with col1:
+     st.subheader("Input Text")
+     user_input = st.text_area(
+         "Enter Khmer text with homophones:",
+         height=200,
+         placeholder="Type or paste your Khmer text here...",
+         key="input_text"
+     )
+
+     correct_button = st.button("🔄 Correct Text", type="primary", use_container_width=True)
+
+ with col2:
+     st.subheader("Results")
+     if correct_button and user_input:
+         with st.spinner("Processing..."):
+             try:
+                 # Load model
+                 model_components = load_model(MODEL_CONFIG["path"])
+
+                 # Process the text
+                 corrected = process_text(user_input, model_components)
+
+                 # Find corrections
+                 corrections = find_corrections(user_input, corrected)
+
+                 # Display results
+                 st.markdown("**Corrected Text:**")
+                 st.markdown(f'<div class="result-text">{corrected}</div>', unsafe_allow_html=True)
+
+                 # Show corrections if any were made
+                 if corrections:
+                     st.success(f"Found {len(corrections)} corrections!")
+                     st.markdown("**Corrections made:**")
+                     for i, correction in enumerate(corrections, 1):
+                         st.markdown(f"""
+                         <div class="correction-details">
+                             {i}. Changed "{correction['original']}" to "{correction['corrected']}"
+                         </div>
+                         """, unsafe_allow_html=True)
+                 else:
+                     st.warning("No corrections were made.")
+             except Exception as e:
+                 st.error(f"An error occurred: {str(e)}")
+     elif correct_button:
+         st.warning("Please enter text first!")
+
+ # Footer
+ st.markdown("---")
+ st.markdown("""
+ <div style='text-align: center; padding: 10px;'>
+     <a href='https://sites.google.com/paragoniu.edu.kh/khmerhomophonecorrector/home'
+        target='_blank'
+        style='text-decoration: none; color: #1f77b4; font-size: 16px;'>
+         📚 Learn more about this project
+     </a>
+ </div>
+ """, unsafe_allow_html=True)
khmerhomophonecorrector/batch_size_impact.png ADDED

Git LFS Details

  • SHA256: d9510d10f251a7599d02e95835c1ceb6ec335763fbb0da3cf23d8a7ddda0eb9f
  • Pointer size: 131 Bytes
  • Size of remote file: 447 kB
khmerhomophonecorrector/data/.DS_Store ADDED
Binary file (6.15 kB).
 
khmerhomophonecorrector/data/test.json ADDED
The diff for this file is too large to render.
 
khmerhomophonecorrector/data/train.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7d1f07ca76eb9e270b523e5ccd476f348d996c68d94a7c050c9313ea5b43834
+ size 25445389
khmerhomophonecorrector/data/val.json ADDED
The diff for this file is too large to render.
 
khmerhomophonecorrector/dataset_distribution.png ADDED
khmerhomophonecorrector/header.png ADDED

Git LFS Details

  • SHA256: 0653c5830c693e074123c68176f3d10d47bbea9c94cf14c9ff94db9d8b0aacbf
  • Pointer size: 132 Bytes
  • Size of remote file: 3.8 MB
khmerhomophonecorrector/homophone_pairs.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f1b62ec06518733f86834a18e229ed67dc0491f0074b9acfbfce96d71033162
+ size 32176015
khmerhomophonecorrector/homophone_test.json ADDED
@@ -0,0 +1,272 @@
+ {
+   "homophones": [
+     ["ក", "ក៏", "ករ", "ករណ៍"],
+     ["កល", "កល់"],
+     ["កាប់", "កប្ប"],
+     ["កាប", "កាព្យ"],
+     ["កូត", "កូដ"],
+     ["កាំ", "កម្ម"],
+     ["កេះ", "កែះ", "កេស", "កែស"],
+     ["ក្រិត", "ក្រឹត្យ", "ក្រឹត", "ក្រិដ្ឋ"],
+     ["កាណ៌", "ការណ៍", "ការ្យ"],
+     ["ក្លា", "ខ្លា"],
+     ["កាន់", "កណ្ឌ"],
+     ["កួរ", "គួរ"],
+     ["កេរ", "កេរ្តិ៍", "គេ", "គែ", "គេហ៍"],
+     ["ក្មួយ", "ខ្មួយ"],
+     ["ក្លាស់", "ខ្លះ"],
+     ["ក្លែង", "ខ្លែង"],
+     ["ក្រាស", "ក្រាស់"],
+     ["ក្រិស", "ក្រេស"],
+     ["កំពង់", "កំពុង"],
+     ["ក្រំ", "ក្រម", "គ្រាំ"],
+     ["ក្រួស", "គ្រោះ"],
+     ["កោន", "កោណ"],
+     ["កោត", "កោដ្ឋ", "កោដិ"],
+     ["កុះ", "កោស", "កូស"],
+     ["កន្លះ", "កន្លាស់"],
+     ["ខ្ចប់", "ខ្ជាប់"],
+     ["ខន្ធ", "ខ័ន", "ខាន់", "ខណ្ឌ"],
+     ["ខុរ", "ខុល"],
+     ["ខ្វេះ", "ខ្វែះ"],
+     ["ខែត្រ", "ខេត្ត"],
+     ["គន់", "គន្ធ", "គុន", "គុណ", "គណ"],
+     ["គត", "គត់", "គុត"],
+     ["គប់", "គុប"],
+     ["គល់", "គុល", "គឹល"],
+     ["គាថា", "កថា"],
+     ["គុំ", "គំ", "គុម្ព", "គមន៏", "គម"],
+     ["គូថ", "គូទ", "គូធ"],
+     ["គ្រា", "គ្រាហ៍"],
+     ["គ្រំ", "គ្រុំ", "គ្រុម"],
+     ["រងំ", "រងុំ"],
+     ["ចរណ៍", "ជ័រ"],
+     ["ចប់", "ជាប់", "ចប"],
+     ["ចារ", "ចារុ៍"],
+     ["ចារិក", "ចារឹក", "ចរិត"],
+     ["ចាក់", "ចក្រ", "ចក្ក"],
+     ["ច័ន", "ចាន់", "ចណ្ឌ", "ចន្ទ", "ចន្រ្ទ", "ចន្ទន៍"],
+     ["ចិត", "ចិត្ត", "ចិត្រ្ត"],
+     ["ចិក", "ចឹក"],
+     ["ចូរ", "ចូល", "ចូឡ"],
+     ["ចេះ", "ចេស", "ចែស", "ជែះ", "ជេះ", "ជេស្ឋ"],
+     ["ច្នៃ", "ឆ្នៃ"],
+     ["ច្រែស", "ច្រេះ", "ច្រេស", "ច្រែះ"],
+     ["ច្រាស", "ច្រាស់"],
+     ["ច្រោះ", "ច្រស"],
+     ["ច្រៀង", "ជ្រៀង"],
+     ["ចោទ", "ចោត"],
+     ["ចំណោត", "ចំណោទ"],
+     ["ឆន្ទ", "ឆាន់"],
+     ["ឆ្វេង", "ឈ្វេង"],
+     ["ជង", "ជង់", "ជង្ឃ"],
+     ["ជច់", "ជុច"],
+     ["ជល", "ជល់", "ជុល"],
+     ["ជន់", "ជន", "ជន្ម"],
+     ["ជីរ", "ជី", "ជីវ៍"],
+     ["ជីប", "ជីព"],
+     ["ជប", "ជប់"],
+     ["ជួស", "ជោះ"],
+     ["ជំនួស", "ជំនោះ"],
+     ["ជំនំ", "ជំនុំ"],
+     ["ជោក", "ជោគ"],
+     ["ជំ", "ជុំ"],
+     ["ជំរំ", "ជុំរុំ"],
+     ["ជ្រង", "ជ្រោង"],
+     ["ជ្រង់", "ជ្រុង"],
+     ["ជ្រួយ", "ជ្រោយ"],
+     ["ជ្រួស", "ជ្រោះ"],
+     ["ឈឹង", "ឆឹង"],
+     ["ញុះ", "ញោះ", "ញោស"],
+     ["ដ", "ដរ", "ដ៏"],
+     ["ដប", "ដប់"],
+     ["ដា", "ដារ"],
+     ["ដាស", "ដាស់"],
+     ["ដុះ", "ដុស"],
+     ["ណ៎ះ", "ណាស់"],
+     ["ត្រប់", "ទ្រាប់", "ទ្រព្យ"],
+     ["ត្លុក", "ថ្លុក"],
+     ["តិះ", "តេះ", "តេស្ត"],
+     ["ត្រិះ", "ត្រែះ", "ត្រេះ"],
+     ["ទង់", "ទុង"],
+     ["ទប់", "ទព្វ"],
+     ["ទល់", "ទុល"],
+     ["ទាល់", "ទ័ល"],
+     ["ទន់", "ទុន"],
+     ["ទន្ត", "ទណ្ឌ", "ទាន់"],
+     ["ទា", "ទារ"],
+     ["ទិច", "ទិត្យ", "តិច"],
+     ["ទំ", "ទុំ", "ទម"],
+     ["ទុក", "ទុក្ខ"],
+     ["ទូ", "ទូរ"],
+     ["ទាប", "ទៀប", "តៀប"],
+     ["ទិញ", "ទេញ"],
+     ["ទៃ", "ទេយ្យ", "ទ័យ"],
+     ["ទេស", "ទេសន៍", "ទែះ"],
+     ["ទោះ", "ទស", "ទស់", "ទស្សន៍"],
+     ["ទោ", "ទោរ"],
+     ["ទ្រង់", "ទ្រុង"],
+     ["ធន", "ធន់", "ធុន"],
+     ["ធំ", "ធុំ"],
+     ["ធុញ", "ធញ្ញ"],
+     ["នប់", "នព្វ"],
+     ["និង", "នឹង", "ហ្នឹង"],
+     ["នោះ", "នុះ", "នុ៎ះ"],
+     ["នៅ", "នូវ"],
+     ["នាក់", "អ្នក"],
+     ["នាដ", "នាថ"],
+     ["នរនាថ", "នរនាទ"],
+     ["នាល", "នាឡិ"],
+     ["និមិត្ត", "និមិ្មត"],
+     ["នៃ", "ន័យ", "នី"],
+     ["បក្ខ", "បក្ស", "ប៉ាក់"],
+     ["បញ្ចប់", "បញ្ជាប់"],
+     ["បណ្ឌិត", "បណ្ឌិត្យ"],
+     ["បាត់", "បត្រ", "បត្ត", "បត្តិ", "ប័តន៍", "ប័ត", "ប័ទ"],
+     ["បត់", "បទ", "ប័ទ្ម", "បដ", "បថ"],
+     ["បន្ទំ", "បន្ទុំ"],
+     ["បាទ", "បាត", "បាត្រ"],
+     ["បិណ្ឌ", "បិន"],
+     ["បាស", "បះ"],
+     ["បុះ", "បុស្ស", "បូស", "បុស្ប"],
+     ["បិត", "បិទ"],
+     ["បូ", "បូព៌", "បូណ៌"],
+     ["បរិបូរ", "បរិបូរណ៍", "បរិបូរណ៌"],
+     ["បម្រះ", "បម្រាស", "បម្រាស់"],
+     ["ប្រី", "ប្រិយ"],
+     ["ប្រឹស", "ប្រឹះ", "ប្រឹស្ឋ", "ប្រើស"],
+     ["ប្រសិទ្ធ", "ប្រសិទ្ធិ័"],
+     ["ប្រសូត", "ប្រសូតិ"],
+     ["ប្រុស", "ប្រុះ"],
+     ["ប្រួញ", "ព្រួញ"],
+     ["ប្រួល", "ព្រួល"],
+     ["ប្រះ", "ប្រាស", "ប្រាស់"],
+     ["ប្រមាថ", "ប្រមាទ"],
+     ["អប្រមាថ", "អប្រមាទ"],
+     ["ប្រៀប", "ព្រៀប", "ព្រាប"],
+     ["ប្រោះ", "ប្រស់", "ប្រស", "ប្រោស"],
+     ["ប្រសប់", "ប្រសព្វ"],
+     ["ប្រេះ", "ប្រែះ"],
+     ["ប្លៀក", "ភ្លៀក", "ផ្លៀក"],
+     ["ពន្លត់", "ពន្លុត"],
+     ["ពាត", "ពាធ", "ពាទ្យ"],
+     ["ពារ", "ពៀរ"],
+     ["ព្រាង", "ព្រៀង"],
+     ["ពិន", "ពិណ", "បុិន"],
+     ["ពៃ", "ពៃរ៏"],
+     ["ពេចន៍", "ពេជ្ឈ", "ពេជ្រ", "ពេច", "ពិច"],
+     ["ពង់", "ពង្ស", "ពុង"],
+     ["ព័ទ្ធ", "ព៌ត", "ពត្តិ", "ពាត់", "ព័ត"],
+     ["ពុត", "ពុធ", "ពុទ្ធ", "ពត់"],
+     ["ពន់", "ពុន", "ពន្ធ"],
+     ["ព័ន្ធ", "ពាន់"],
+     ["ពល", "ពល់", "ពុល"],
+     ["ពស់", "ពោះ"],
+     ["ពព្រុស", "ពព្រូស"],
+     ["ពោរ", "ពោធិ៍", "ពោធិ", "ពោ"],
+     ["ព្រិច", "ព្រេច"],
+     ["ព្រិល", "ព្រឹល"],
+     ["ព្រឹត្ត", "ព្រឹត្តិ", "ព្រឹទ្ធ"],
+     ["ព្រុស", "ព្រួស"],
+     ["ព្រួស", "ព្រោះ"],
+     ["ព្រំ", "ព្រហ្ម"],
+     ["ព្រឹក", "ព្រឹក្ស"],
+     ["ព្រឹក្សា", "ប្រឹក្សា"],
+     ["ភប់", "ភព"],
+     ["ភក្តិ", "ភ័ក", "ភក្រ្ត", "ភ័គ", "ភក្ស"],
+     ["ភាន់", "ភ័ន្ត", "ភ័ណ្ឌ", "ភ័ណ"],
+     ["មិត្ត", "មិទ្ធៈ", "មឹត"],
+     ["មួ", "មួរ"],
+     ["ម៉ដ្ធ", "ម៉ត់"],
+     ["ម្រាក់", "ម្រ័ក្សណ៍"],
+     ["យន់", "យន្ត", "យ័ន្ត", "យ័ន"],
+     ["រង់", "រុង", "រង្គ", "រង"],
+     ["រថ", "រដ្ឋ", "រត្ន", "រាត់"],
+     ["រា", "រាហុ៍"],
+     ["រាក", "រាគ"],
+     ["រាក់", "រក្ស", "រ័ក"],
+     ["រាច", "រាជ", "រាជ្យ"],
+     ["រាម", "រៀម"],
+     ["រស", "រស់", "រួស", "រោះ"],
+     ["រាស់", "រ៉ស់"],
+     ["រុិល", "រឹល"],
+     ["រុក", "រុក្ខ"],
+     ["រុត", "រុទ្ធ", "រុត្តិ"],
+     ["រុះ", "រូស"],
+     ["រំ", "រុំ", "រម្យ"],
+     ["រំលិច", "រំលេច"],
+     ["រោច", "រោចន៍"],
+     ["របោះ", "របស់"],
+     ["រឹង", "រុឹង"],
+     ["រាំ", "រម្មណ៍"],
+     ["រៀបរប", "រៀបរាប់"],
+     ["លក់", "ល័ក្ត", "លក្ខណ៍", "ល័ក្ខ", "លក្ម្សណ៍"],
+     ["លាប", "លាភ", "លៀប"],
+     ["លង់", "លុង"],
+     ["លន់", "លុន"],
+     ["លប់", "លុប"],
+     ["លោះ", "លស់", "លួស"],
+     ["លិច", "លេច"],
+     ["លាង", "លៀង"],
+     ["លុត", "លត់", "លុត្ត"],
+     ["លាប់", "ឡប់"],
+     ["លិទ្ធ", "លិឍ", "លិត"],
+     ["លួង", "ហ្លួង"],
+     ["លេស", "លេះ"],
+     ["ល្បះ", "ល្បាស់"],
+     ["វង់", "វង្ស"],
+     ["វន្ត", "វ័ន", "វាន់"],
+     ["វត្ត", "វត្ស", "វ័ធ", "វត្ថ", "វដ្ត", "វឌ្ឍន៍", "វាត់", "វត្តន៍"],
+     ["វ័យ", "វៃ", "វាយ", "វ៉ៃ"],
+     ["វាត", "វាទ"],
+     ["វិច", "វេច", "វេជ្ជ", "វេច្ច"],
+     ["វិញ", "វេញ"],
+     ["វាច", "វៀច"],
+     ["វាង", "វៀង"],
+     ["វាល", "វៀល"],
+     ["សង់", "សង្ឃ"],
+     ["ស័ក", "ស័ក្តិ", "សក្យ", "សគ្គ", "សគ៌ៈ"],
+     ["ស័ង្ខ", "សាំង"],
+     ["សស្ត្រា", "សាស្ត្រា"],
+     ["សត្វ", "សត", "សត្យ", "សាត់"],
+     ["សប្ត", "សព្ទ", "សាប់", "សប្ប"],
+     ["សប", "សប់", "សព្វ", "សព", "សប្តិ"],
+     ["សាសន៍", "សស្ត្រ", "សះ"],
+     ["សិត", "សិទ្ធ", "សិទ្ធិ"],
+     ["សិង", "សិង្ហ", "សឹង", "សុឹង"],
+     ["សុក", "សុក្ក", "សុខ", "សុក្រ"],
+     ["សិរ", "សិរ្ស", "សេ", "សេរ"],
+     ["សូ", "សូរ", "សូរ្យ", "សូល៍"],
+     ["សូទ", "សូត", "សូត្រ", "សូធ្យ", "សូទ្រ"],
+     ["សូន", "សូន្យ"],
+     ["សូម", "សុំ"],
+     ["សួ", "សួរ", "សួគ៌"],
+     ["សេដ្ធ", "សេត"],
+     ["សោត", "សោធ", "សោធន៍"],
+     ["សំ", "សម"],
+     ["សម្បត្តិ", "សម្ប័ទ"],
+     ["សម្បូរ", "សម្បូណ៍"],
+     ["សម្រិត", "សំរឹទ្ធ"],
+     ["សមិត", "សមិតិ", "សមិទ្ធ", "សមិទ្ធិ"],
+     ["ស្និត", "ស្និទ្ធ"],
+     ["ស្រស", "ស្រស់"],
+     ["ស្រុះ", "ស្រុស"],
+     ["ស្រះ", "ស្រាស់"],
+     ["ស្លេះ", "ស្លេស្ម"],
+     ["សេស", "សេះ"],
+     ["ហត្ថ", "ហាត់"],
+     ["ហស", "ហស្ត", "ហស្ថ", "ហោះ", "ហស្បតិ៍"],
+     ["ហាស", "ហ័ស", "ហស្ស"],
+     ["ហោង", "ហង"],
+     ["អក", "អករ៍"],
+     ["អ័ក្ស", "អាក់"],
+     ["អង់", "អង្គ", "អង"],
+     ["អដ្ឋ", "អត្ថ", "អឌ្ឍ", "អត្ត", "អាត់"],
+     ["អន់", "អន្ធ"],
+     ["អាចារ", "អាចារ្យ"],
+     ["អាថ៌", "អាទិ"],
+     ["អាប់", "អប្ប", "អ័ព្ទ"],
+     ["អារម្មណ៍", "អារម្ភ"],
+     ["ឥត", "ឥដ្ឋ", "ឥទ្ធិ"]
+   ]
+ }
khmerhomophonecorrector/infer_from_json.py ADDED
@@ -0,0 +1,270 @@
+ import torch
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+ from khmernltk import word_tokenize
+ import json
+ import time
+ import argparse
+ import os
+ from datetime import datetime
+ import random
+ import re
+ from collections import defaultdict
+
+ def normalize_text(text):
+     # Remove all spaces and special markers
+     text = re.sub(r'\s+', '', text)
+     text = re.sub(r'[«»]', '', text)
+     return text
+
+ def word_segment(text):
+     return " ".join(word_tokenize(text)).replace(" ", " ▂ ")
+
+ def format_output(text):
+     """Format text with proper Unicode handling"""
+     return text.replace("</s>", "").replace("<2km>", "").replace("▂", " ").strip()
+
+ def load_homophone_groups(homophone_json):
+     with open(homophone_json, 'r', encoding='utf-8') as f:
+         data = json.load(f)
+     groups = []
+     for group in data['homophones']:
+         groups.append([normalize_text(word) for word in group])
+     return groups
+
+ def find_homophones_in_sentence(sentence, homophone_groups):
+     found = []
+     for group in homophone_groups:
+         for word in group:
+             if word in sentence:
+                 found.append((word, group))
+                 break  # Only count the first found from the group
+     return found
+
+ def analyze_homophone_changes(input_text, output_text, homophone_groups):
+     input_norm = normalize_text(input_text)
+     output_norm = normalize_text(output_text)
+
+     # Find homophones in input sentence
+     input_homophones = find_homophones_in_sentence(input_norm, homophone_groups)
+
+     if not input_homophones:
+         return None, []  # No homophones found
+
+     # Track replacements
+     replacements = []
+     all_correct = True
+
+     for word, group in input_homophones:
+         # Check if the word was replaced with a different homophone
+         if word in output_norm:
+             # Don't add unchanged words to replacements
+             continue
+         else:
+             # Find which homophone from the group was used
+             replacement = None
+             for alt_word in group:
+                 if alt_word in output_norm:
+                     replacement = alt_word
+                     break
+
+             if replacement:
+                 replacements.append(f"'{word}' → '{replacement}'")
+                 # If the replacement is from the same homophone group, it's correct
+                 if replacement in group:
+                     continue  # This is a correct replacement
+                 else:
+                     all_correct = False
+             else:
+                 replacements.append(f"'{word}' (missing in output)")
+                 all_correct = False
+
+     # If there are no replacements, return None to indicate no changes
+     if not replacements:
+         return None, []
+
+     return all_correct, replacements
+
+ def process_text(text, model, tokenizer, device):
+     """Process a single text input"""
+     # Word segment the input text
+     segmented_text = word_segment(text)
+     input_text = f"{segmented_text} </s> <2km>"
+
+     # Encode input
+     inputs = tokenizer(
+         input_text,
+         return_tensors="pt",
+         padding=True,
+         truncation=True,
+         max_length=512,
+         add_special_tokens=True,
+         return_token_type_ids=False
+     )
+
+     # Move inputs to device
+     inputs = {k: v.to(device) for k, v in inputs.items()}
+
+     # Generate
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_length=512,
+             num_beams=3,
+             early_stopping=True,
+             do_sample=False,
+             no_repeat_ngram_size=2,
+             forced_bos_token_id=32000,
+             forced_eos_token_id=32001,
+             length_penalty=0.8,
+             temperature=1.0
+         )
+
+     # Decode and format output
+     corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return format_output(corrected)
+
+ def process_json_file(json_file, output_file, model, tokenizer, device, homophone_groups):
+     """Process sentences from JSON file and save results"""
+     print(f"Loading data from: {json_file}")
+
+     # Read JSON file
+     with open(json_file, 'r', encoding='utf-8') as f:
+         data = json.load(f)
+
+     # Extract sentences from the JSON structure
+     sentences = []
+     if isinstance(data, dict) and "homophones" in data:
+         # Handle homophone_test.json format
+         for homophone_group in data["homophones"]:
+             for word in homophone_group:
+                 sentences.append(word)
+     elif isinstance(data, list):
+         # Handle test.json format (list of dicts with 'input' key)
+         if all(isinstance(item, dict) and 'input' in item for item in data):
+             sentences = [item['input'].strip() for item in data if item.get('input', '').strip()]
+         else:
+             sentences = [str(text).strip() for text in data if str(text).strip()]
+     else:
+         # Handle dictionary format
+         for char, text_list in data.items():
+             for text in text_list:
+                 # Clean up the text (remove quotes and extra spaces)
+                 text = text.strip('«»').strip()
+                 if text:
+                     sentences.append(text)
+
+     print(f"Processing {len(sentences)} sentences")
+
+     # Prepare output file with header
+     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+     with open(output_file, 'w', encoding='utf-8') as f:
+         f.write(f"=== Khmer Homophone Correction Results ===\n")
+         f.write(f"Generated on: {timestamp}\n")
+         f.write(f"Input file: {json_file}\n")
+         f.write(f"Model: prahokbart-big-bs32-e40\n")
+         f.write("=" * 50 + "\n\n")
+
+     total_time = 0
+     processed_lines = 0
+     total_analyzed = 0
+     total_corrected = 0
+     total_unchanged = 0
+     total_incorrect = 0
+
+     # Process each sentence
+     for i, sentence in enumerate(sentences, 1):
+         print(f"Processing sentence {i}/{len(sentences)}")
+
+         start_time = time.time()
+         output = process_text(sentence, model, tokenizer, device)
+         process_time = time.time() - start_time
+         total_time += process_time
+         processed_lines += 1
+
+         # Analyze homophone changes
+         is_correct, replacements = analyze_homophone_changes(sentence, output, homophone_groups)
+
+         # Save to output file
+         with open(output_file, 'a', encoding='utf-8') as f:
+             f.write(f"\n=== Sentence {i} ===\n")
+             f.write(f"Input: {sentence}\n")
+             f.write(f"Corrected: {output}\n")
+
+             if replacements:
+                 f.write("Changes: " + ", ".join(replacements) + "\n")
+                 f.write(f"Status: {'✓ Correctly corrected' if is_correct else '✗ Incorrect correction'}\n")
+                 total_analyzed += 1
+                 if is_correct:
+                     total_corrected += 1
+                 else:
+                     total_incorrect += 1
+             else:
+                 f.write("Status: No changes needed\n")
+                 total_unchanged += 1
+
+             f.write("=" * 50 + "\n")
+
+     # Calculate accuracy
+     accuracy = (total_corrected / total_analyzed * 100) if total_analyzed > 0 else 0
+
+     # Add summary at the end
+     with open(output_file, 'a', encoding='utf-8') as f:
+         f.write(f"\nSummary:\n")
+         f.write(f"Total sentences processed: {processed_lines}\n")
+         f.write(f"Sentences needing correction: {total_analyzed}\n")
+         f.write(f"Sentences unchanged (no changes needed): {total_unchanged}\n")
+         f.write(f"Correctly corrected: {total_corrected}\n")
+         f.write(f"Incorrectly corrected: {total_incorrect}\n")
+         f.write(f"Accuracy (among sentences needing correction): {accuracy:.2f}%\n")
+         f.write(f"Total processing time: {total_time:.2f} seconds\n")
+         f.write(f"Average time per sentence: {total_time/processed_lines:.2f} seconds\n")
+
+     print(f"\nProcessing complete!")
+     print(f"Results saved to: {output_file}")
+     print(f"Total sentences processed: {processed_lines}")
+     print(f"Sentences needing correction: {total_analyzed}")
+     print(f"Sentences unchanged: {total_unchanged}")
+     print(f"Correctly corrected: {total_corrected}")
+     print(f"Incorrectly corrected: {total_incorrect}")
+     print(f"Accuracy: {accuracy:.2f}%")
+     print(f"Total time: {total_time:.2f} seconds")
+     print(f"Average time per sentence: {total_time/processed_lines:.2f} seconds")
+
+ def main():
+     parser = argparse.ArgumentParser(description='Khmer Homophone Corrector - JSON Processing Version')
+     parser.add_argument('--model_path', type=str, default='./prahokbart-big-bs32-e40',
+                         help='Path to the model directory')
+     parser.add_argument('--json_file', type=str, default='data/test.json',
+                         help='Input JSON file containing Khmer text')
+     parser.add_argument('--output_file', type=str, default='test_results.txt',
+                         help='Output file for corrections')
+     parser.add_argument('--homophone_file', type=str, default='homophone_test.json',
+                         help='JSON file containing homophone groups')
+     args = parser.parse_args()
+
+     # Validate input files
+     if not os.path.exists(args.json_file):
+         print(f"Error: Input file {args.json_file} not found")
+         return
+     if not os.path.exists(args.homophone_file):
+         print(f"Error: Homophone file {args.homophone_file} not found")
+         return
+
+     print("Loading model...")
+     model = AutoModelForSeq2SeqLM.from_pretrained(args.model_path)
+     tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model = model.to(device)
+     model.eval()
+
+     print(f"Model loaded successfully. Using device: {device}")
+
+     # Load homophone groups
+     homophone_groups = load_homophone_groups(args.homophone_file)
+
+     # Process the JSON file
+     process_json_file(args.json_file, args.output_file, model, tokenizer, device, homophone_groups)
+
+ if __name__ == "__main__":
+     main()
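Note: a minimal example invocation, relying on the argparse defaults above (checkpoint at ./prahokbart-big-bs32-e40, homophone groups in homophone_test.json):

python infer_from_json.py --json_file data/test.json --output_file test_results.txt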
khmerhomophonecorrector/khmerhomophonecorrector/added_tokens.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "</s>": 32001,
+   "<2en>": 32003,
+   "<2km>": 32002,
+   "<s>": 32000
+ }
khmerhomophonecorrector/khmerhomophonecorrector/config.json ADDED
@@ -0,0 +1,102 @@
+ {
+   "activation_dropout": 0.1,
+   "activation_function": "gelu",
+   "adaptor_activation_function": "gelu",
+   "adaptor_dropout": 0.1,
+   "adaptor_hidden_size": 512,
+   "adaptor_init_std": 0.02,
+   "adaptor_scaling_factor": 1.0,
+   "adaptor_tuning": false,
+   "additional_source_wait_k": -1,
+   "alibi_encoding": false,
+   "architectures": [
+     "MBartForConditionalGeneration"
+   ],
+   "asymmetric_alibi_encoding": false,
+   "attention_dropout": 0.1,
+   "bos_token_id": 32000,
+   "bottleneck_mid_fusion_tokens": 4,
+   "classifier_dropout": 0.0,
+   "d_model": 1024,
+   "decoder_adaptor_tying_config": null,
+   "decoder_attention_heads": 16,
+   "decoder_ffn_dim": 4096,
+   "decoder_layerdrop": 0.0,
+   "decoder_layers": 6,
+   "decoder_tying_config": null,
+   "deep_adaptor_tuning": false,
+   "deep_adaptor_tuning_ffn_only": false,
+   "dropout": 0.1,
+   "embed_low_rank_dim": 0,
+   "encoder_adaptor_tying_config": null,
+   "encoder_attention_heads": 16,
+   "encoder_ffn_dim": 4096,
+   "encoder_layerdrop": 0.0,
+   "encoder_layers": 6,
+   "encoder_tying_config": null,
+   "eos_token_id": 32001,
+   "expert_ffn_size": 128,
+   "features_embed_dims": null,
+   "features_vocab_sizes": null,
+   "forced_eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "gradient_reversal_for_domain_classifier": false,
+   "hypercomplex": false,
+   "hypercomplex_n": 2,
+   "ia3_adaptors": false,
+   "init_std": 0.02,
+   "initialization_scheme": "static",
+   "is_encoder_decoder": true,
+   "layernorm_adaptor_input": false,
+   "layernorm_prompt_projection": false,
+   "lora_adaptor_rank": 2,
+   "lora_adaptors": false,
+   "max_position_embeddings": 1024,
+   "mid_fusion_layers": 3,
+   "model_type": "mbart",
+   "moe_adaptors": false,
+   "multi_source": false,
+   "multi_source_method": null,
+   "multilayer_softmaxing": null,
+   "no_embed_norm": false,
+   "no_positional_encoding_decoder": false,
+   "no_positional_encoding_encoder": false,
+   "no_projection_prompt": false,
+   "no_scale_attention_embedding": false,
+   "num_domains_for_domain_classifier": 1,
+   "num_experts": 8,
+   "num_hidden_layers": 6,
+   "num_moe_adaptor_experts": 4,
+   "num_prompts": 100,
+   "num_sparsify_blocks": 8,
+   "pad_token_id": 0,
+   "parallel_adaptors": false,
+   "positional_encodings": false,
+   "postnorm_decoder": false,
+   "postnorm_encoder": false,
+   "prompt_dropout": 0.1,
+   "prompt_init_std": 0.02,
+   "prompt_projection_hidden_size": 4096,
+   "prompt_tuning": false,
+   "recurrent_projections": 1,
+   "residual_connection_adaptor": false,
+   "residual_connection_prompt": false,
+   "rope_encoding": false,
+   "scale_embedding": false,
+   "softmax_bias_tuning": false,
+   "softmax_temperature": 1.0,
+   "sparsification_temperature": 3.0,
+   "sparsify_attention": false,
+   "sparsify_ffn": false,
+   "target_vocab_size": 0,
+   "temperature_calibration": false,
+   "tokenizer_class": "AlbertTokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.52.4",
+   "unidirectional_encoder": false,
+   "use_cache": true,
+   "use_moe": false,
+   "use_tanh_activation_prompt": false,
+   "vocab_size": 32004,
+   "wait_k": -1
+ }
khmerhomophonecorrector/khmerhomophonecorrector/generation_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 32000,
+   "eos_token_id": 32001,
+   "forced_eos_token_id": 2,
+   "pad_token_id": 0,
+   "transformers_version": "4.52.4"
+ }
khmerhomophonecorrector/khmerhomophonecorrector/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:769ca23f42096c9d6b39066783203be73a2e45501f864d012937ab254c71b784
+ size 845114336
khmerhomophonecorrector/khmerhomophonecorrector/special_tokens_map.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "additional_special_tokens": [
+     "<s>",
+     "</s>",
+     "<2km>",
+     "<2en>"
+   ],
+   "bos_token": "[CLS]",
+   "cls_token": "[CLS]",
+   "eos_token": "[SEP]",
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<pad>",
+   "sep_token": "[SEP]",
+   "unk_token": "<unk>"
+ }
khmerhomophonecorrector/khmerhomophonecorrector/spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b052ec9a665776835d69c50d2226e5e55574db8fbbce5563f217cbe18f91d41
+ size 783261
khmerhomophonecorrector/khmerhomophonecorrector/tokenizer_config.json ADDED
@@ -0,0 +1,99 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[MASK]",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32000": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32001": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32002": {
+       "content": "<2km>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32003": {
+       "content": "<2en>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<s>",
+     "</s>",
+     "<2km>",
+     "<2en>"
+   ],
+   "bos_token": "[CLS]",
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "eos_token": "[SEP]",
+   "extra_special_tokens": {},
+   "keep_accents": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "remove_space": true,
+   "sep_token": "[SEP]",
+   "sp_model_kwargs": {},
+   "strip_accents": false,
+   "tokenizer_class": "AlbertTokenizer",
+   "unk_token": "<unk>",
+   "use_fast": false
+ }
khmerhomophonecorrector/khmerhomophonecorrector/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:52291c446349172438b91dd34e1e8f993856af8b90cf6a3e46e9a38c1a972524
+ size 5432
khmerhomophonecorrector/khmerhomophonecorrector/training_state.json ADDED
@@ -0,0 +1 @@
+ {"completed_epochs": 40, "best_metric": 0.02335953153669834, "checkpoints": ["checkpoint-16160"]}
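Note: the khmerhomophonecorrector/khmerhomophonecorrector/ folder above is a complete checkpoint (config.json, model.safetensors, spiece.model, tokenizer_config.json), so it should be loadable directly with transformers — a minimal sketch, assuming the folder is passed as a local path:

from transformers import MBartForConditionalGeneration, AutoTokenizer

# Hypothetical local path; adjust to wherever the folder is checked out.
path = "khmerhomophonecorrector/khmerhomophonecorrector"
model = MBartForConditionalGeneration.from_pretrained(path)
tokenizer = AutoTokenizer.from_pretrained(path)  # AlbertTokenizer, per tokenizer_config.json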
khmerhomophonecorrector/loss_comparison.png ADDED

Git LFS Details

  • SHA256: 303924640edf6b23964e09315eb6cc876d9c6585a85a9a75ee1972f5fd04aeb9
  • Pointer size: 131 Bytes
  • Size of remote file: 640 kB
khmerhomophonecorrector/metrics_comparison.png ADDED

Git LFS Details

  • SHA256: 301cfa138ced2c6b3feafc91bdd82166e794f8530c7bf315dedd7f02aa10eb30
  • Pointer size: 131 Bytes
  • Size of remote file: 460 kB
khmerhomophonecorrector/model_performance_line_chart.html ADDED
The diff for this file is too large to render.
 
khmerhomophonecorrector/model_performance_line_chart.png ADDED
khmerhomophonecorrector/model_performance_line_chart_big_bs32_e40.html ADDED
The diff for this file is too large to render.
 
khmerhomophonecorrector/model_performance_line_chart_big_bs32_e40.png ADDED
khmerhomophonecorrector/model_performance_table.html ADDED
@@ -0,0 +1,80 @@
+ <style type="text/css">
+ </style>
+ <table id="T_4f546">
+   <thead>
+     <tr>
+       <th class="blank level0" >&nbsp;</th>
+       <th id="T_4f546_level0_col0" class="col_heading level0 col0" >Model Config</th>
+       <th id="T_4f546_level0_col1" class="col_heading level0 col1" >BLEU-1</th>
+       <th id="T_4f546_level0_col2" class="col_heading level0 col2" >BLEU-2</th>
+       <th id="T_4f546_level0_col3" class="col_heading level0 col3" >BLEU-3</th>
+       <th id="T_4f546_level0_col4" class="col_heading level0 col4" >BLEU-4</th>
+       <th id="T_4f546_level0_col5" class="col_heading level0 col5" >WER</th>
+     </tr>
+   </thead>
+   <tbody>
+     <tr>
+       <th id="T_4f546_level0_row0" class="row_heading level0 row0" >0</th>
+       <td id="T_4f546_row0_col0" class="data row0 col0" >prahokbart_base (BS=8.0)</td>
+       <td id="T_4f546_row0_col1" class="data row0 col1" >99.407</td>
+       <td id="T_4f546_row0_col2" class="data row0 col2" >98.897</td>
+       <td id="T_4f546_row0_col3" class="data row0 col3" >98.413</td>
+       <td id="T_4f546_row0_col4" class="data row0 col4" >97.970</td>
+       <td id="T_4f546_row0_col5" class="data row0 col5" >0.009</td>
+     </tr>
+     <tr>
+       <th id="T_4f546_level0_row1" class="row_heading level0 row1" >1</th>
+       <td id="T_4f546_row1_col0" class="data row1 col0" >prahokbart_base (BS=16.0)</td>
+       <td id="T_4f546_row1_col1" class="data row1 col1" >98.962</td>
+       <td id="T_4f546_row1_col2" class="data row1 col2" >98.040</td>
+       <td id="T_4f546_row1_col3" class="data row1 col3" >97.139</td>
+       <td id="T_4f546_row1_col4" class="data row1 col4" >96.297</td>
+       <td id="T_4f546_row1_col5" class="data row1 col5" >0.015</td>
+     </tr>
+     <tr>
+       <th id="T_4f546_level0_row2" class="row_heading level0 row2" >2</th>
+       <td id="T_4f546_row2_col0" class="data row2 col0" >prahokbart_base (BS=32.0)</td>
+       <td id="T_4f546_row2_col1" class="data row2 col1" >98.302</td>
+       <td id="T_4f546_row2_col2" class="data row2 col2" >96.752</td>
+       <td id="T_4f546_row2_col3" class="data row2 col3" >95.254</td>
+       <td id="T_4f546_row2_col4" class="data row2 col4" >93.864</td>
+       <td id="T_4f546_row2_col5" class="data row2 col5" >0.022</td>
+     </tr>
+     <tr>
+       <th id="T_4f546_level0_row3" class="row_heading level0 row3" >3</th>
+       <td id="T_4f546_row3_col0" class="data row3 col0" >prahokbart_big (BS=8.0)</td>
+       <td id="T_4f546_row3_col1" class="data row3 col1" >99.407</td>
+       <td id="T_4f546_row3_col2" class="data row3 col2" >98.897</td>
+       <td id="T_4f546_row3_col3" class="data row3 col3" >98.413</td>
+       <td id="T_4f546_row3_col4" class="data row3 col4" >97.970</td>
+       <td id="T_4f546_row3_col5" class="data row3 col5" >0.009</td>
+     </tr>
+     <tr>
+       <th id="T_4f546_level0_row4" class="row_heading level0 row4" >4</th>
+       <td id="T_4f546_row4_col0" class="data row4 col0" >prahokbart_big (BS=16.0)</td>
+       <td id="T_4f546_row4_col1" class="data row4 col1" >99.195</td>
+       <td id="T_4f546_row4_col2" class="data row4 col2" >98.526</td>
+       <td id="T_4f546_row4_col3" class="data row4 col3" >97.880</td>
+       <td id="T_4f546_row4_col4" class="data row4 col4" >97.279</td>
+       <td id="T_4f546_row4_col5" class="data row4 col5" >0.012</td>
+     </tr>
+     <tr>
+       <th id="T_4f546_level0_row5" class="row_heading level0 row5" >5</th>
+       <td id="T_4f546_row5_col0" class="data row5 col0" >prahokbart_big (BS=32.0)</td>
+       <td id="T_4f546_row5_col1" class="data row5 col1" >99.007</td>
+       <td id="T_4f546_row5_col2" class="data row5 col2" >98.169</td>
+       <td id="T_4f546_row5_col3" class="data row5 col3" >97.365</td>
+       <td id="T_4f546_row5_col4" class="data row5 col4" >96.619</td>
+       <td id="T_4f546_row5_col5" class="data row5 col5" >0.014</td>
+     </tr>
+     <tr>
+       <th id="T_4f546_level0_row6" class="row_heading level0 row6" >6</th>
+       <td id="T_4f546_row6_col0" class="data row6 col0" >prahokbart_big (BS=32.0)</td>
+       <td id="T_4f546_row6_col1" class="data row6 col1" >99.540</td>
+       <td id="T_4f546_row6_col2" class="data row6 col2" >99.162</td>
+       <td id="T_4f546_row6_col3" class="data row6 col3" >98.809</td>
+       <td id="T_4f546_row6_col4" class="data row6 col4" >98.486</td>
+       <td id="T_4f546_row6_col5" class="data row6 col5" >0.008</td>
+     </tr>
+   </tbody>
+ </table>
khmerhomophonecorrector/test_results.txt ADDED
The diff for this file is too large to render.
 
khmerhomophonecorrector/tool/.DS_Store ADDED
Binary file (6.15 kB).
 
khmerhomophonecorrector/tool/__pycache__/khnormal.cpython-312.pyc ADDED
Binary file (9.4 kB).
 
khmerhomophonecorrector/tool/balance_data.py ADDED
@@ -0,0 +1,39 @@
+ import json
+ import re
+
+ # === Helper: Clean sentence formatting ===
+ def clean_text(text):
+     return re.sub(r"\s+", " ", text).strip()
+
+ # === Load correct_homophone.json ===
+ with open("correct_homophone.json", "r", encoding="utf-8") as f:
+     data = json.load(f)
+
+ # === Load homophone_test.json for order ===
+ with open("homophone_test.json", "r", encoding="utf-8") as f:
+     homophone_groups = json.load(f)["homophones"]
+ word_order = [word for group in homophone_groups for word in group]
+
+ # === Balance, clean, and limit each word to max 100 sentences ===
+ balanced_data = {}
+
+ for word, sentences in data.items():
+     # Clean each sentence
+     cleaned_sentences = [clean_text(s) for s in sentences]
+     # Deduplicate (order-preserving) and limit to 100
+     unique_sentences = list(dict.fromkeys(cleaned_sentences))[:100]
+     balanced_data[word] = unique_sentences
+
+ # === Reorder based on homophone_test.json ===
+ ordered_data = {word: balanced_data[word] for word in word_order if word in balanced_data}
+
+ # === Add any remaining words not in homophone_test.json ===
+ for word in balanced_data:
+     if word not in ordered_data:
+         ordered_data[word] = balanced_data[word]
+
+ # === Save the final output ===
+ with open("balanced_correct_homophone.json", "w", encoding="utf-8") as f:
+     json.dump(ordered_data, f, ensure_ascii=False, indent=4)
+
+ print("✅ Done! Output saved to 'balanced_correct_homophone.json'")
khmerhomophonecorrector/tool/clean_data.py ADDED
@@ -0,0 +1,52 @@
+ import json
+ import re
+ import os
+
+ def load_homophones(homophone_file):
+     with open(homophone_file, "r", encoding="utf-8") as f:
+         data = json.load(f)
+     homophone_sets = data["homophones"]
+     sorted_homophones = {tuple(sorted(set(group))): group for group in homophone_sets}
+     return sorted_homophones
+
+ def clean_text(text):
+     text = text.replace("\n", " ").replace("\r", " ").replace("\t", " ")
+     text = re.sub(r'http\S+', '', text)
+     text = re.sub(r'\d+', '', text)
+     text = re.sub(r'[a-zA-Z]+', '', text)
+     text = re.sub(r'[\u2000-\u206F\u25A0-\u25FF]+', '', text)
+     text = re.sub(r'[ៗ៚]', '', text)
+     text = re.sub(r'[<>()!@#$%^&*_+={}\[\]:;"\'\\|/?.,~-]', '', text)
+     text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with one
+     return text.strip()
+
+ def clean_and_combine_txt_files(input_folder, output_file, homophones):
+     combined_cleaned_data = []
+
+     for filename in os.listdir(input_folder):
+         if filename.endswith(".txt"):
+             file_path = os.path.join(input_folder, filename)
+             try:
+                 with open(file_path, "r", encoding="utf-8") as f:
+                     content = f.read()
+                 cleaned = clean_text(content)
+                 if cleaned:
+                     combined_cleaned_data.append(cleaned)
+                 print(f"Processed: {filename}")
+             except Exception as e:
+                 print(f"Error processing {filename}: {e}")
+
+     with open(output_file, "w", encoding="utf-8") as out:
+         json.dump(combined_cleaned_data, out, ensure_ascii=False, indent=4)
+     print(f"\n✅ Combined and cleaned content saved to: {output_file}")
+
+ # File paths
+ homophone_file = "homophone_test.json"
+ input_folder = "data_khmer"
+ output_file = "cleaned_combined_articles.json"
+
+ # Load homophones (not used yet in this script)
+ homophones = load_homophones(homophone_file)
+
+ # Clean and combine
+ clean_and_combine_txt_files(input_folder, output_file, homophones)
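For a quick sanity check of clean_text, here is an abbreviated re-statement with a toy input (the full function above also strips Khmer repetition signs and a wider punctuation set):

    import re

    def clean_text_demo(text):
        text = re.sub(r'http\S+', '', text)    # URLs
        text = re.sub(r'\d+', '', text)        # digits
        text = re.sub(r'[a-zA-Z]+', '', text)  # Latin letters
        text = re.sub(r'[!?.,]', '', text)     # sample punctuation
        return re.sub(r'\s+', ' ', text).strip()

    print(clean_text_demo("ខ្ញុំ visit http://example.com 123 ក្នុងឆ្នាំ!"))  # -> ខ្ញុំ ក្នុងឆ្នាំ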
khmerhomophonecorrector/tool/combine_homophones.py ADDED
@@ -0,0 +1,46 @@
+ import json
+ import re
+ import ijson
+
+ # === Helper: Clean up sentence formatting ===
+ def clean_text(text):
+     return re.sub(r"\s+", " ", text).strip()
+
+ # === Load small files normally ===
+ with open("correct_homophone.json", "r", encoding="utf-8") as f:
+     segmented_data = json.load(f)
+
+ with open("homophone_test.json", "r", encoding="utf-8") as f:
+     homophone_groups = json.load(f)["homophones"]
+ word_order = [word for group in homophone_groups for word in group]
+ allowed_words = set(word_order)
+
+ # === Stream and merge balanced_correct_homophone.json ===
+ filtered_file = "balanced_correct_homophone.json"
+
+ with open(filtered_file, "r", encoding="utf-8") as f:
+     parser = ijson.kvitems(f, "")
+     for word, new_sentences in parser:
+         if word not in allowed_words:
+             continue  # ⚡ Skip words that are not allowed
+
+         existing = segmented_data.get(word, [])
+
+         existing_cleaned = {clean_text(s) for s in existing}
+         new_cleaned = {clean_text(s) for s in new_sentences}
+
+         merged = sorted(existing_cleaned.union(new_cleaned))
+         segmented_data[word] = merged
+
+ # === Build the final ordered dataset ===
+ ordered_data = {}
+
+ for word in word_order:
+     if word in segmented_data:
+         ordered_data[word] = segmented_data[word]
+
+ # === Save final output ===
+ with open("finalcorrect_homophone.json", "w", encoding="utf-8") as f:
+     json.dump(ordered_data, f, ensure_ascii=False, indent=4)
+
+ print("✅ Merging complete! Only words from 'homophone_test.json' included. Check 'finalcorrect_homophone.json'.")
khmerhomophonecorrector/tool/complete_homophone_sentences.py ADDED
@@ -0,0 +1,182 @@
+ import json
+ import random
+ from collections import defaultdict
+ from khmernltk import word_tokenize
+
+ def load_data():
+     try:
+         # Load the original output file
+         with open('incorrect_homophone_sorted.json', 'r', encoding='utf-8') as f:
+             incorrect = json.load(f)
+
+         # Load the analysis file
+         with open('incorrect_homophone_analysis4.json', 'r', encoding='utf-8') as f:
+             analysis = json.load(f)
+
+         # Load the correct sentences
+         with open('correct_homophone.json', 'r', encoding='utf-8') as f:
+             correct = json.load(f)
+
+         # Load homophone groups
+         with open('homophone_test.json', 'r', encoding='utf-8') as f:
+             homophones = json.load(f)['homophones']
+
+         return incorrect, analysis, correct, homophones
+     except Exception as e:
+         print(f"Error loading files: {e}")
+         return None, None, None, None
+
+ def find_incomplete_homophones(analysis):
+     incomplete = {}
+     for group_name, group_data in analysis['homophone_summary'].items():
+         for word, count in group_data.items():
+             if count < 100:
+                 if group_name not in incomplete:
+                     incomplete[group_name] = []
+                 incomplete[group_name].append((word, count))
+     return incomplete
+
+ def find_similar_words(word, correct):
+     """Find words that might be similar to the target word"""
+     similar = []
+     for other_word in correct.keys():
+         if other_word != word and len(other_word) == len(word):
+             # Check if they share any characters
+             if any(c in other_word for c in word):
+                 similar.append(other_word)
+     return similar
+
+ def generate_additional_sentences(incorrect, correct, homophones, incomplete):
+     # Create mapping from word to its homophone group
+     word_to_group = {}
+     for group in homophones:
+         for word in group:
+             word_to_group[word] = group
+
+     # Process each incomplete group
+     for group_name, words in incomplete.items():
+         print(f"\nProcessing group: {group_name}")
+         for word, current_count in words:
+             needed = 100 - current_count
+             if needed <= 0:
+                 continue
+
+             print(f"  Generating {needed} more sentences for {word}")
+
+             # Initialize if not present
+             if word not in incorrect:
+                 incorrect[word] = []
+
+             # Strategy 1: Try to use sentences from the same homophone group
+             group_words = word_to_group.get(word, [])
+             source_words = [w for w in group_words if w in correct and w != word]
+
+             if source_words:
+                 print(f"  Using {len(source_words)} words from same group")
+                 attempts = 0
+                 max_attempts = needed * 10
+
+                 while len(incorrect[word]) < 100 and attempts < max_attempts:
+                     attempts += 1
+                     source_word = random.choice(source_words)
+
+                     for sentence in correct[source_word]:
+                         if len(incorrect[word]) >= 100:
+                             break
+
+                         tokens = word_tokenize(sentence)
+                         positions = [i for i, t in enumerate(tokens) if t == source_word]
+
+                         if not positions:
+                             continue
+
+                         new_tokens = tokens.copy()
+                         replace_pos = random.choice(positions)
+                         new_tokens[replace_pos] = word
+                         new_sentence = ''.join(new_tokens)
+
+                         if new_sentence not in incorrect[word]:
+                             incorrect[word].append(new_sentence)
+
+                             if len(incorrect[word]) % 20 == 0:
+                                 print(f"    {word}: {len(incorrect[word])}/100")
+
+             # Strategy 2: If still not enough, try similar words
+             if len(incorrect[word]) < 100:
+                 print(f"  Trying similar words for {word}")
+                 similar_words = find_similar_words(word, correct)
+
+                 if similar_words:
+                     print(f"  Found {len(similar_words)} similar words")
+                     attempts = 0
+                     max_attempts = (100 - len(incorrect[word])) * 10
+
+                     while len(incorrect[word]) < 100 and attempts < max_attempts:
+                         attempts += 1
+                         source_word = random.choice(similar_words)
+
+                         for sentence in correct[source_word]:
+                             if len(incorrect[word]) >= 100:
+                                 break
+
+                             tokens = word_tokenize(sentence)
+                             positions = [i for i, t in enumerate(tokens) if t == source_word]
+
+                             if not positions:
+                                 continue
+
+                             new_tokens = tokens.copy()
+                             replace_pos = random.choice(positions)
+                             new_tokens[replace_pos] = word
+                             new_sentence = ''.join(new_tokens)
+
+                             if new_sentence not in incorrect[word]:
+                                 incorrect[word].append(new_sentence)
+
+                                 if len(incorrect[word]) % 20 == 0:
+                                     print(f"    {word}: {len(incorrect[word])}/100")
+
+             # Final check
+             if len(incorrect[word]) < 100:
+                 print(f"  Warning: Could only generate {len(incorrect[word])} sentences for {word}")
+             else:
+                 print(f"  Successfully generated 100 sentences for {word}")
+
+     return incorrect
+
+ def save_results(data, filename='incorrect_homophone_completed.json'):
+     with open(filename, 'w', encoding='utf-8') as f:
+         json.dump(data, f, ensure_ascii=False, indent=2)
+
+ def main():
+     incorrect, analysis, correct, homophones = load_data()
+     if not all([incorrect, analysis, correct, homophones]):
+         print("Failed to load data files")
+         return
+
+     print("Finding incomplete homophones...")
+     incomplete = find_incomplete_homophones(analysis)
+
+     print(f"\nFound {len(incomplete)} groups with incomplete sentences")
+     for group_name, words in incomplete.items():
+         print(f"\n{group_name}:")
+         for word, count in words:
+             print(f"  {word}: {count}/100")
+
+     print("\nGenerating additional sentences...")
+     updated_incorrect = generate_additional_sentences(incorrect, correct, homophones, incomplete)
+
+     # Save the updated results
+     save_results(updated_incorrect)
+     print("\nDone! Results saved to incorrect_homophone_completed.json")
+
+     # Print final statistics
+     total_words = len(updated_incorrect)
+     total_sentences = sum(len(sentences) for sentences in updated_incorrect.values())
+     print(f"\nFinal statistics:")
+     print(f"Total words: {total_words}")
+     print(f"Total sentences: {total_sentences}")
+     print(f"Average sentences per word: {total_sentences/total_words:.2f}")
+
+ if __name__ == "__main__":
+     main()
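The core move above is: tokenize a correct sentence, replace one occurrence of the source word with its homophone, and re-join without spaces (Khmer text is unsegmented). A standalone sketch with a toy sentence; whether a given word surfaces as a single token depends on khmernltk's segmentation:

    import random
    from khmernltk import word_tokenize

    sentence = "ខ្ញុំក៏ទៅសាលា"  # toy sentence containing ក៏
    tokens = word_tokenize(sentence)
    positions = [i for i, t in enumerate(tokens) if t == "ក៏"]
    if positions:
        tokens[random.choice(positions)] = "ក"  # inject the homophone error
    print("".join(tokens))                       # re-join without spaces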
khmerhomophonecorrector/tool/convert_format.py ADDED
@@ -0,0 +1,107 @@
+ import json
+ import re
+ from typing import List, Tuple, Dict
+
+ def load_files(input_file, homophone_file):
+     """Load input data and homophone groups"""
+     with open(input_file, "r", encoding="utf-8") as f:
+         data = json.load(f)
+
+     with open(homophone_file, "r", encoding="utf-8") as f:
+         homophones_data = json.load(f)
+
+     # Create a mapping of words to their homophone groups
+     homophone_map = {}
+     for group in homophones_data["homophones"]:
+         for word in group:
+             homophone_map[word] = group
+
+     return data, homophone_map
+
+ def clean_text(text, special_tokens):
+     """Clean text by removing special tokens and normalizing whitespace"""
+     # Remove special tokens and extra spaces
+     words = [w for w in text.strip().split() if w not in special_tokens]
+     return ' '.join(words)
+
+ def strip_punct(word):
+     """Remove Khmer and ASCII punctuation from a word"""
+     return re.sub(r'[\u17d4-\u17d6\u200b\u200c\u200d\u17c9\u17ca\u17cb\u17cc\u17cd\u17ce\u17cf\u17d0\u17d1\u17d2\u17d3\u17d4\u17d5\u17d6\u17d7\u17d8\u17d9\u17da\u17db\u17dc\u17dd\u0021-\u002f\u003a-\u0040\u005b-\u0060\u007b-\u007e\u2026\u201c\u201d\u2018\u2019\u00ab\u00bb\u300c\u300d\u300e\u300f\u3010\u3011\u3014\u3015\u3016\u3017\u3018\u3019\u301a\u301b\u301c\u301d\u301e\u301f\u002e\u002c\u0964\u0965]', '', word)
+
+ def find_homophone_group(word1, word2, homophone_map):
+     """Find if two words are in the same homophone group"""
+     if word1 in homophone_map and word2 in homophone_map:
+         if homophone_map[word1] == homophone_map[word2]:
+             return tuple(sorted(homophone_map[word1]))  # Use tuple for set uniqueness
+     return None
+
+ def find_homophone_pair(input_text: str, output_text: str) -> Tuple[str, str]:
+     """Find the homophone pair by comparing input and output texts."""
+     input_words = input_text.split()
+     output_words = output_text.split()
+
+     # Find the first different word
+     for i, (in_word, out_word) in enumerate(zip(input_words, output_words)):
+         if in_word != out_word:
+             return in_word, out_word
+
+     return None, None
+
+ def convert_format(input_file: str, output_file: str):
+     """
+     Convert the dataset format to include proper special tokens and homophone groups.
+     Input format:
+     {
+         "input": "នេះដូច ក៏ ក របស់ទ័ព។",
+         "output": "នេះដូច ក ក របស់ទ័ព។",
+         "error_word": "ក៏",
+         "correct_word": "ក"
+     }
+
+     Output format:
+     {
+         "input": "នេះដូច ក៏ ក របស់ទ័ព។ </s> <2km>",
+         "target": "<2km> នេះដូច ក ក របស់ទ័ព។ </s>",
+         "homophone_group": ["ក៏", "ក"],
+         "error_type": "homophone"
+     }
+     """
+     # Read the input file
+     with open(input_file, 'r', encoding='utf-8') as f:
+         data = json.load(f)
+
+     # Convert the format
+     converted_data = []
+     for item in data:
+         # Clean the input and output texts
+         input_text = item['input'].replace('</s>', '').replace('<2km>', '').strip()
+         output_text = item['output'].replace('</s>', '').replace('<2km>', '').strip()
+
+         # Find the homophone pair
+         error_word = item['error_word']
+         correct_word = item['correct_word']
+
+         # Create the converted item
+         converted_item = {
+             "input": f"{input_text} </s> <2km>",
+             "target": f"<2km> {output_text} </s>",
+             "homophone_group": [error_word, correct_word],
+             "error_type": "homophone"
+         }
+         converted_data.append(converted_item)
+
+     # Save the converted data
+     with open(output_file, 'w', encoding='utf-8') as f:
+         json.dump(converted_data, f, ensure_ascii=False, indent=2)
+
+     print(f"Converted {len(converted_data)} samples")
+     print(f"Saved to {output_file}")
+
+     # Print a sample for verification
+     print("\nSample of converted data:")
+     print(json.dumps(converted_data[0], ensure_ascii=False, indent=2))
+
+ if __name__ == "__main__":
+     input_file = "homophone_error_correction.json"
+     output_file = "homophone_error_correction_converted.json"
+     convert_format(input_file, output_file)
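find_homophone_pair assumes pre-segmented, space-separated text and returns the first position where input and output diverge. A quick check, e.g. appended to the script above, using the docstring's own example:

    print(find_homophone_pair("នេះដូច ក៏ ក របស់ទ័ព។", "នេះដូច ក ក របស់ទ័ព។"))
    # -> ('ក៏', 'ក')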
khmerhomophonecorrector/tool/convert_training_data.py ADDED
@@ -0,0 +1,110 @@
+ import json
+ import re
+ from typing import List, Dict, Any
+ import random
+
+ def load_json_file(file_path: str) -> List[Dict[str, Any]]:
+     """Load JSON file and return its contents."""
+     with open(file_path, 'r', encoding='utf-8') as f:
+         return json.load(f)
+
+ def save_json_file(data: List[Dict[str, Any]], file_path: str):
+     """Save data to JSON file."""
+     with open(file_path, 'w', encoding='utf-8') as f:
+         json.dump(data, f, ensure_ascii=False, indent=2)
+
+ def clean_text(text: str) -> str:
+     """Remove special tokens and clean the text."""
+     # Remove special tokens
+     text = text.replace('</s>', '').replace('<2km>', '')
+     # Remove extra spaces
+     text = re.sub(r'\s+', ' ', text)
+     return text.strip()
+
+ def expand_homophone_group(group: List[str]) -> List[str]:
+     """Expand homophone groups to include all common variations."""
+     # Common homophone groups in Khmer
+     homophone_mappings = {
+         'ក': ['ក', 'ក៏', 'ករ', 'ករណ៍'],
+         'ករ': ['ក', 'ក៏', 'ករ', 'ករណ៍'],
+         'ករណ៍': ['ក', 'ក៏', 'ករ', 'ករណ៍'],
+         'ក៏': ['ក', 'ក៏', 'ករ', 'ករណ៍'],
+         # Add more mappings as needed
+     }
+
+     expanded_group = set()
+     for word in group:
+         if word in homophone_mappings:
+             expanded_group.update(homophone_mappings[word])
+
+     return list(expanded_group) if expanded_group else group
+
+ def create_natural_context(input_text: str, target_text: str) -> tuple:
+     """Create more natural context by adding surrounding text."""
+     # Common Khmer sentence starters and connectors
+     starters = [
+         "នៅពេលនោះ",
+         "ដូច្នេះ",
+         "ដើម្បី",
+         "ព្រោះ",
+         "ដោយសារ",
+         "នៅក្នុង",
+         "នៅលើ",
+         "នៅពេល",
+     ]
+
+     # Add a random starter if needed; use the same one on both sides so
+     # input and target differ only at the homophone position.
+     if not any(input_text.startswith(s) for s in starters):
+         starter = random.choice(starters)
+         input_text = f"{starter} {input_text}"
+         target_text = f"{starter} {target_text}"
+
+     return input_text, target_text
+
+ def convert_format(input_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+     """Convert the data format to match seq2seq_homophone.json."""
+     converted_data = []
+
+     for item in input_data:
+         # Clean the input and target texts
+         input_text = clean_text(item['input'])
+         target_text = clean_text(item['target'])
+
+         # Create more natural context
+         input_text, target_text = create_natural_context(input_text, target_text)
+
+         # Expand homophone group
+         homophone_group = expand_homophone_group(item['homophone_group'])
+
+         # Create new format
+         new_item = {
+             "input": input_text,
+             "target": target_text,
+             "homophone_group": homophone_group
+         }
+
+         converted_data.append(new_item)
+
+     return converted_data
+
+ def main():
+     # Load the input data
+     input_data = load_json_file('homophone_pairs.json')
+
+     # Convert the format
+     converted_data = convert_format(input_data)
+
+     # Save the converted data
+     save_json_file(converted_data, 'converted_homophone_pairs.json')
+
+     print(f"Converted {len(converted_data)} examples")
+     print("Sample of converted data:")
+     for i, item in enumerate(converted_data[:3]):
+         print(f"\nExample {i+1}:")
+         print(f"Input: {item['input']}")
+         print(f"Target: {item['target']}")
+         print(f"Homophone group: {item['homophone_group']}")
+
+ if __name__ == "__main__":
+     main()
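With create_natural_context applying the same starter to both sides, an input/target pair differs only at the homophone slot, which is what a correction model should learn. Toy illustration with a hypothetical pair:

    import random

    starters = ["ដូច្នេះ", "ព្រោះ"]
    starter = random.choice(starters)
    inp = f"{starter} នេះដូច ក៏ ក របស់ទ័ព។"
    tgt = f"{starter} នេះដូច ក ក របស់ទ័ព។"
    assert inp.split()[0] == tgt.split()[0]  # identical starter on both sides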
khmerhomophonecorrector/tool/debug_homophone_check.py ADDED
@@ -0,0 +1,54 @@
+ import json
+ import re
+
+ # Load the segmented output data (main dataset)
+ with open("correct_homophone.json", "r", encoding="utf-8") as f:
+     segmented_data = json.load(f)
+
+ # Load the homophone analysis results
+ with open("missing_homophone_analysis.json", "r", encoding="utf-8") as f:
+     homophone_analysis = json.load(f)
+
+ # Extract partially missing and under-80-sentence homophones
+ partially_missing = homophone_analysis.get("partially_missing_homophones", {})
+ under_80 = homophone_analysis.get("under_80_sentences_homophones", {})
+
+ # Combine the word sets
+ debug_words = set()
+
+ for word_data in partially_missing.values():
+     debug_words.update(word_data.keys())
+
+ for word_data in under_80.values():
+     debug_words.update(word_data.keys())
+
+ # Pre-compile regex patterns for each word
+ patterns = {word: re.compile(rf'(?<!\S){re.escape(word)}(?!\S)') for word in debug_words}
+
+ # Debug: find sentences to fill up to 80
+ debug_results = {}
+
+ for word in debug_words:
+     matches = []  # start fresh
+
+     pattern = patterns[word]
+     for key, sentences in segmented_data.items():
+         for sentence in sentences:
+             # If sentence is tokenized (list), join it into a normal sentence
+             if isinstance(sentence, list):
+                 sentence = ''.join(sentence)  # JOIN WITHOUT SPACE (for Khmer)
+
+             if pattern.search(sentence):
+                 matches.append(sentence)
+                 if len(matches) == 80:
+                     break
+         if len(matches) == 80:
+             break
+
+     debug_results[word] = matches  # already capped at 80
+
+ # Save the results
+ with open("homophone_debug_results.json", "w", encoding="utf-8") as f:
+     json.dump(debug_results, f, ensure_ascii=False, indent=4)
+
+ print("✅ Completed filling each word up to 80 sentences with regex matching! Saved to 'homophone_debug_results.json'.")
khmerhomophonecorrector/tool/filter.py ADDED
@@ -0,0 +1,40 @@
+ import ijson
+ import json
+ from collections import OrderedDict
+ from tqdm import tqdm
+
+ def load_homophones_ordered(homophone_file):
+     with open(homophone_file, "r", encoding="utf-8") as f:
+         data = json.load(f)
+     ordered_words = []
+     for group in data["homophones"]:
+         ordered_words.extend(group)
+     return ordered_words
+
+ def filter_sentences_fast(segmented_file, homophone_words, output_file):
+     filtered_results = OrderedDict((word, []) for word in homophone_words)
+     homophone_set = set(homophone_words)
+
+     with open(segmented_file, "r", encoding="utf-8") as f:
+         parser = ijson.kvitems(f, '')  # key-value pairs
+
+         for key, sentence_list in tqdm(parser, desc="Filtering"):
+             for sentence in sentence_list:
+                 tokens_in_sentence = set(sentence)
+                 matched_words = homophone_set.intersection(tokens_in_sentence)
+                 for word in matched_words:
+                     filtered_results[word].append(sentence)
+
+     with open(output_file, "w", encoding="utf-8") as f:
+         json.dump(filtered_results, f, ensure_ascii=False, indent=4)
+
+     print(f"✅ Fast filtered results saved to {output_file}")
+
+ # === Run ===
+ if __name__ == "__main__":
+     homophone_file = "homophone_test.json"
+     segmented_file = "segmented_grouped_cleaned.json"
+     output_file = "filtered_output.json"
+
+     homophone_words = load_homophones_ordered(homophone_file)
+     filter_sentences_fast(segmented_file, homophone_words, output_file)
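Because each segmented sentence is a list of tokens, set(sentence) gives its vocabulary and a single intersection call finds every homophone it contains. A toy run of that step:

    homophone_set = {"ក", "ក៏"}
    sentence = ["នេះ", "ក៏", "ជា", "ការ"]          # token list, as in the segmented file
    matched = homophone_set.intersection(sentence)  # intersection accepts any iterable
    print(matched)  # {'ក៏'}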
khmerhomophonecorrector/tool/homophone_missing.py ADDED
@@ -0,0 +1,88 @@
+ import json
+
+ def load_homophones(homophone_file):
+     with open(homophone_file, "r", encoding="utf-8") as f:
+         data = json.load(f)
+     return data["homophones"]
+
+ def load_cleaned_data(cleaned_file):
+     with open(cleaned_file, "r", encoding="utf-8") as f:
+         return json.load(f)
+
+ def analyze_missing_homophones(homophones, cleaned_data):
+     missing_homophones = {}
+     partially_missing_homophones = {}
+     under_80_sentences_homophones = {}
+     between_80_and_99_homophones = {}
+
+     for homophone_set in homophones:
+         homophone_key = ", ".join(homophone_set)
+
+         word_counts = {}
+         words_with_0_sentences = {}
+         words_under_80 = {}
+         words_between_80_and_99 = {}
+         all_zero = True
+
+         for word in homophone_set:
+             sentence_count = len(cleaned_data.get(word, []))
+             word_counts[word] = sentence_count
+
+             if sentence_count > 0:
+                 all_zero = False
+                 if sentence_count < 80:
+                     words_under_80[word] = sentence_count
+                 elif 80 <= sentence_count < 100:
+                     words_between_80_and_99[word] = sentence_count
+             else:
+                 words_with_0_sentences[word] = 0
+                 words_under_80[word] = 0  # also include in under_80
+
+         if all_zero:
+             missing_homophones[homophone_key] = word_counts
+         elif words_with_0_sentences:
+             partially_missing_homophones[homophone_key] = words_with_0_sentences
+
+         if words_under_80:
+             under_80_sentences_homophones[homophone_key] = words_under_80
+         if words_between_80_and_99:
+             between_80_and_99_homophones[homophone_key] = words_between_80_and_99
+
+     return (
+         missing_homophones,
+         partially_missing_homophones,
+         under_80_sentences_homophones,
+         between_80_and_99_homophones
+     )
+
+ def save_results(
+     missing_homophones,
+     partially_missing_homophones,
+     under_80_sentences_homophones,
+     between_80_and_99_homophones,
+     output_file
+ ):
+     results = {
+         "completely_missing_homophones": missing_homophones,
+         "partially_missing_homophones": partially_missing_homophones,
+         "under_80_sentences_homophones": under_80_sentences_homophones,
+         "between_80_and_99_homophones": between_80_and_99_homophones
+     }
+
+     with open(output_file, "w", encoding="utf-8") as f:
+         json.dump(results, f, ensure_ascii=False, indent=4)
+
+     print(f"✅ Missing homophones analysis saved to {output_file}")
+
+ # File paths
+ homophone_file = "homophone_test.json"
+ cleaned_file = "correct_homophone.json"
+ output_file = "missing_homophone_analysis1.json"
+
+ # Load data and analyze
+ homophones = load_homophones(homophone_file)
+ cleaned_data = load_cleaned_data(cleaned_file)
+ missing_h, partial_h, under_80, between_80_99 = analyze_missing_homophones(homophones, cleaned_data)
+
+ # Save results
+ save_results(missing_h, partial_h, under_80, between_80_99, output_file)
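The analysis buckets each word by sentence count: completely missing (0), under 80, between 80 and 99, and implicitly complete at 100 or more. A compact mirror of those thresholds (illustrative helper, not part of the script):

    def bucket(count):
        # Mirrors the thresholds in analyze_missing_homophones.
        if count == 0:
            return "missing"
        if count < 80:
            return "under_80"
        if count < 100:
            return "between_80_and_99"
        return "complete"

    print([bucket(c) for c in (0, 40, 85, 120)])
    # ['missing', 'under_80', 'between_80_and_99', 'complete']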
khmerhomophonecorrector/tool/khnormal.py ADDED
@@ -0,0 +1,158 @@
+ #!/usr/bin/python3
+ # Copyright (c) 2021-2024, SIL Global.
+ # Licensed under MIT license: https://opensource.org/licenses/MIT
+
+ import enum, re, regex
+
+ class Cats(enum.Enum):
+     Other = 0; Base = 1; Robat = 2; Coeng = 3;
+     Shift = 4; Z = 5; VPre = 6; VB = 7; VA = 8;
+     VPost = 9; MS = 10; MF = 11; ZFCoeng = 12
+
+ categories = ([Cats.Base] * 35    # 1780-17A2
+     + [Cats.Other] * 2            # 17A3-17A4
+     + [Cats.Base] * 15            # 17A5-17B3
+     + [Cats.Other] * 2            # 17B4-17B5
+     + [Cats.VPost]                # 17B6
+     + [Cats.VA] * 4               # 17B7-17BA
+     + [Cats.VB] * 3               # 17BB-17BD
+     + [Cats.VPre] * 8             # 17BE-17C5
+     + [Cats.MS]                   # 17C6
+     + [Cats.MF] * 2               # 17C7-17C8
+     + [Cats.Shift] * 2            # 17C9-17CA
+     + [Cats.MS]                   # 17CB
+     + [Cats.Robat]                # 17CC
+     + [Cats.MS] * 5               # 17CD-17D1
+     + [Cats.Coeng]                # 17D2
+     + [Cats.MS]                   # 17D3
+     + [Cats.Other] * 9            # 17D4-17DC
+     + [Cats.MS])                  # 17DD
+
+ khres = {  # useful regular sub expressions used later
+     # All bases
+     "B": "[\u1780-\u17A2\u17A5-\u17B3\u25CC]",
+     # All consonants excluding Ro
+     "NonRo": "[\u1780-\u1799\u179B-\u17A2\u17A5-\u17B3]",
+     # All consonants excluding Ba
+     "NonBA": "[\u1780-\u1793\u1795-\u17A2\u17A5-\u17B3]",
+     # Series 1 consonants
+     "S1": "[\u1780-\u1783\u1785-\u1788\u178A-\u178D\u178F-\u1792"
+           "\u1795-\u1797\u179E-\u17A0\u17A2]",
+     # Series 2 consonants
+     "S2": "[\u1784\u1789\u178E\u1793\u1794\u1798-\u179D\u17A1\u17A3-\u17B3]",
+     # Simple following vowel in Modern Khmer
+     "VA": "(?:[\u17B7-\u17BA\u17BE\u17BF\u17DD]|\u17B6\u17C6)",
+     # Above vowel (as per shifter rules) with vowel sequences
+     "VAX": "(?:[\u17C1-\u17C5]?{VA})",
+     # Above vowel with samyok (Modern Khmer)
+     "VAS": "(?:{VA}|[\u17C1-\u17C3]?\u17D0)",
+     # Above vowel with samyok (Middle Khmer)
+     "VASX": "(?:{VAX}|[\u17C1-\u17C3]?\u17D0)",
+     # Below vowel (with Middle Khmer prefix)
+     "VB": "(?:[\u17C1-\u17C3]?[\u17BB-\u17BD])",
+     # contains series 1 and no BA
+     "STRONG": """ {S1}\u17CC? # series 1 robat?\n (?:\u17D2{NonBA} # nonba coengs\n (?:\u17D2{NonBA})?)?\n | {NonBA}\u17CC? # nonba robat?\n (?: \u17D2{S1} # series 1 coeng\n (?:\u17D2{NonBA})? # + any nonba coeng\n | \u17D2{NonBA}\u17D2{S1} # nonba coeng + series 1 coeng\n )""",
+     # contains BA or only series 2
+     "NSTRONG": """(?:{S2}\u17CC?(?:\u17D2{S2}(?:\u17D2{S2})?)? # Series 2 + series 2 coengs\n |\u1794\u17CC?(?:{COENG}(?:{COENG})?)? # or ba with any coeng\n |{B}\u17CC?(?:\u17D2{NonRo}\u17D2\u1794 # or ba coeng\n |\u17D2\u1794(?:\u17D2{B})))""",
+     "COENG": "(?:(?:\u17D2{NonRo})?\u17D2{B})",
+     # final coeng
+     "FCOENG": "(?:\u200D(?:\u17D2{NonRo})+)",
+     # Allowed shifter sequences in Modern Khmer
+     "SHIFT": """(?: (?<={STRONG}) \u17CA\u200C (?={VA}) # strong + triisap held up\n | (?<={NSTRONG})\u17C9\u200C (?={VAS}) # weak + muusikatoan held up\n | [\u17C9\u17CA] # any shifter\n )""",
+     # Allowed shifter sequences in Middle Khmer
+     "SHIFTX": """(?:(?<={STRONG}) \u17CA\u200C (?={VAX}) # strong + triisap held up\n | (?<={NSTRONG})\u17C9\u200C (?={VASX}) # weak + muusikatoan held up\n | [\u17C9\u17CA] # any shifter\n )""",
+     # Modern Khmer vowel
+     "V": "[\u17B6-\u17C5]?",
+     # Middle Khmer vowel sequences (not worth trying to unpack this)
+     "VX": "(?:\u17C1[\u17BC\u17BD]?[\u17B7\u17B9\u17BA]?|"
+           "[\u17C2\u17C3]?[\u17BC\u17BD]?[\u17B7-\u17BA]\u17B6|"
+           "[\u17C2\u17C3]?[\u17BB-\u17BD]?\u17B6|\u17BE[\u17BC\u17BD]?\u17B6?|"
+           "[\u17C1-\u17C5]?\u17BB(?![\u17D0\u17DD])|"
+           "[\u17BF\u17C0]|[\u17C2-\u17C5]?[\u17BC\u17BD]?[\u17B7-\u17BA]?)",
+     # Modern Khmer modifiers
+     "MS": """(?:(?: [\u17C6\u17CB\u17CD-\u17CF\u17D1\u17D3] # follows anything\n | (?<!\u17BB) [\u17D0\u17DD]) # not after -u\n [\u17C6\u17CB\u17CD-\u17D1\u17D3\u17DD]? # And an optional second\n )""",
+     # Middle Khmer modifiers
+     "MSX": """(?:(?: [\u17C6\u17CB\u17CD-\u17CF\u17D1\u17D3] # follows anything\n | (?<!\u17BB [\u17B6\u17C4\u17C5]?) # blocking -u sequence\n [\u17D0\u17DD]) # for these modifiers\n [\u17C6\u17CB\u17CD-\u17D1\u17D3\u17DD]? # And an optional second\n )"""
+ }
+
+ # expand 3 times: SHIFTX -> VASX -> VAX -> VA
+ for i in range(3):
+     khres = {k: v.format(**khres) for k, v in khres.items()}
+
+ def charcat(c):
+     ''' Returns the Khmer character category for a single char string'''
+     o = ord(c)
+     if 0x1780 <= o <= 0x17DD:
+         return categories[o-0x1780]
+     elif o == 0x200C:
+         return Cats.Z
+     elif o == 0x200D:
+         return Cats.ZFCoeng
+     return Cats.Other
+
+ def lunar(m, base):
+     ''' Returns the lunar date symbol from the appropriate set base '''
+     v = (ord(m.group(1) or "\u17E0") - 0x17E0) * 10 + ord(m.group(2)) - 0x17E0
+     if v > 15:  # translate \u17D4\u17D2\u17E0 as well
+         return m.group(0)
+     return chr(v+base)
+
+ def khnormal(txt, lang="km"):
+     ''' Returns khmer normalised string, without fixing or marking errors'''
+     # Mark final coengs in Middle Khmer
+     if lang == "xhm":
+         txt = re.sub(r"([\u17B6-\u17C5]\u17D2)", "\u200D\\1", txt)
+     # Categorise every character in the string
+     charcats = [charcat(c) for c in txt]
+
+     # Recategorise base -> coeng after coeng char (or ZFCoeng)
+     for i in range(1, len(charcats)):
+         if txt[i-1] in "\u200D\u17D2" and charcats[i] in (Cats.Base, Cats.Coeng):
+             charcats[i] = charcats[i-1]
+
+     # Find subranges of base+non other and sort components in the subrange
+     i = 0
+     res = []
+     while i < len(charcats):
+         c = charcats[i]
+         if c != Cats.Base:
+             res.append(txt[i])
+             i += 1
+             continue
+         # Scan for end of syllable
+         j = i + 1
+         while j < len(charcats) and charcats[j].value > Cats.Base.value:
+             j += 1
+         # Sort syllable based on character categories
+         # Sort the char indices by category then position in string
+         newindices = sorted(range(i, j), key=lambda e: (charcats[e].value, e))
+         replaces = "".join(txt[n] for n in newindices)
+
+         replaces = re.sub("(\u200D?\u17D2)[\u17D2\u200C\u200D]+",
+                           r"\1", replaces)  # remove multiple invisible chars
+         replaces = re.sub("\u17BE\u17B6", "\u17C4\u17B8", replaces)  # confusable vowels
+         # map compound vowel sequences to compounds with -u before to be converted
+         replaces = re.sub("\u17C1([\u17BB-\u17BD]?)\u17B8", "\u17BE\\1", replaces)
+         replaces = re.sub("\u17C1([\u17BB-\u17BD]?)\u17B6", "\u17C4\\1", replaces)
+         replaces = re.sub("(\u17BE)(\u17BB)", r"\2\1", replaces)
+         # Replace -u + upper vowel with consonant shifter
+         replaces = re.sub(("((?:{STRONG})[\u17C1-\u17C5]?)\u17BB" +
+                            "(?={VA}|\u17D0)").format(**khres), "\\1\u17CA",
+                           replaces, flags=re.X)
+         replaces = re.sub(("((?:{NSTRONG})[\u17C1-\u17C5]?)\u17BB" +
+                            "(?={VA}|\u17D0)").format(**khres), "\\1\u17C9",
+                           replaces, flags=re.X)
+         replaces = re.sub("(\u17D2\u179A)(\u17D2[\u1780-\u17B3])",
+                           r"\2\1", replaces)  # coeng ro second
+         replaces = re.sub("(\u17D2)\u178A", "\\1\u178F", replaces)  # coeng da->ta
+         # convert lunar dates from old style to use lunar date symbols
+         replaces = re.sub("(\u17E1?)([\u17E0-\u17E9])\u17D2\u17D4",
+                           lambda m: lunar(m, 0x19E0), replaces)
+         replaces = re.sub("\u17D4\u17D2(\u17E1?)([\u17E0-\u17E9])",
+                           lambda m: lunar(m, 0x19F0), replaces)
+         replaces = re.sub("\u17D4\u17D2\u17D4", "\u19F0", replaces)
+         res.append(replaces)
+         i = j
+     return "".join(res)
+
+ # The rest of the script (CLI, khtest, etc.) is omitted for import use.
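Typical use is to normalize raw Khmer before tokenization so canonically equivalent spellings compare equal. A small usage check (my toy example, assuming the module is importable from the working directory):

    from khnormal import khnormal

    # "ខែ្មរ" has the vowel ែ typed before the coeng cluster ្ម;
    # normalization reorders it to the canonical spelling "ខ្មែរ".
    print(khnormal("ខែ្មរ"))  # -> ខ្មែរ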
khmerhomophonecorrector/tool/normalize_khmer.py ADDED
@@ -0,0 +1,52 @@
+ import json
+ import sys
+ import os
+
+ # Add the current directory to Python path
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+ sys.path.append(current_dir)
+
+ from khnormal import khnormal
+
+ def normalize_khmer_text(text):
+     """Normalize Khmer text using khnormal."""
+     return khnormal(text)
+
+ def process_json_file(input_file, output_file):
+     """Process JSON file and normalize Khmer text."""
+     try:
+         # Read the input JSON file
+         with open(input_file, 'r', encoding='utf-8') as f:
+             data = json.load(f)
+
+         # Normalize each word in the pairs
+         normalized_data = []
+         for pair in data:
+             normalized_pair = {
+                 'input': normalize_khmer_text(pair['input']),
+                 'target': normalize_khmer_text(pair['target']),
+                 'homophone_group': pair['homophone_group']
+             }
+             normalized_data.append(normalized_pair)
+
+         # Write the normalized data to output file
+         with open(output_file, 'w', encoding='utf-8') as f:
+             json.dump(normalized_data, f, ensure_ascii=False, indent=2)
+
+         print(f"Successfully normalized Khmer text and saved to {output_file}")
+
+     except Exception as e:
+         print(f"Error processing file: {str(e)}")
+         sys.exit(1)
+
+ if __name__ == "__main__":
+     # Get the parent directory path
+     parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+     files_to_process = [
+         (os.path.join(parent_dir, "data/train.json"), os.path.join(parent_dir, "data/train_normalized.json")),
+         (os.path.join(parent_dir, "data/test.json"), os.path.join(parent_dir, "data/test_normalized.json")),
+         (os.path.join(parent_dir, "data/val.json"), os.path.join(parent_dir, "data/val_normalized.json"))
+     ]
+     for input_file, output_file in files_to_process:
+         process_json_file(input_file, output_file)
khmerhomophonecorrector/tool/segmentation.py ADDED
@@ -0,0 +1,48 @@
+ import ijson
+ import json
+ from khmernltk import sentence_tokenize, word_tokenize
+ from collections import defaultdict
+ from tqdm import tqdm
+
+ INPUT_FILE = "correct_homophone.json"
+ HOMOPHONE_FILE = "homophone_test.json"
+ OUTPUT_FILE = "segmented_grouped.json"
+
+ def load_target_keys(homophone_file):
+     with open(homophone_file, "r", encoding="utf-8") as f:
+         data = json.load(f)
+     # Flatten the list of homophone sets
+     return [word for group in data["homophones"] for word in group]
+
+ def process_sentences_from_key(key, data):
+     segmented = []
+     for article in data:
+         try:
+             sentences = sentence_tokenize(article)
+             for sentence in sentences:
+                 tokens = word_tokenize(sentence)
+                 segmented.append(tokens)
+         except Exception as e:
+             print(f"❌ Error in '{key}': {e}")
+     return segmented
+
+ def main():
+     print("🚀 Segmenting with homophone grouping...")
+
+     target_keys = load_target_keys(HOMOPHONE_FILE)
+     results = {}
+
+     with open(INPUT_FILE, "r", encoding="utf-8") as f:
+         parser = ijson.kvitems(f, "")
+
+         for key, value in tqdm(parser, desc="Processing keys"):
+             if key in target_keys:
+                 results[key] = process_sentences_from_key(key, value)
+
+     with open(OUTPUT_FILE, "w", encoding="utf-8") as f_out:
+         json.dump(results, f_out, ensure_ascii=False, indent=4)
+
+     print(f"\n✅ Done! Output saved to: {OUTPUT_FILE}")
+
+ if __name__ == "__main__":
+     main()
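The segmentation pass simply chains khmernltk's two tokenizers: sentences first, then words within each sentence. Standalone sketch with a toy two-sentence article:

    from khmernltk import sentence_tokenize, word_tokenize

    article = "ខ្ញុំទៅសាលា។ គាត់ទៅផ្សារ។"
    for sent in sentence_tokenize(article):
        print(word_tokenize(sent))  # one token list per sentence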