Rasmus Lellep committed
Commit 76b1ec5
1 Parent(s): 9e93eb6

add loader

kuidastaltsutadalaamat/.gitignore ADDED
@@ -0,0 +1,162 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
kuidastaltsutadalaamat/LICENSE ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 TartuNLP

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
kuidastaltsutadalaamat/README.md ADDED
@@ -0,0 +1,2 @@
# Kuidas taltsutada laamat
Implementation of LLM continued training and inference.
kuidastaltsutadalaamat/aux.py ADDED
@@ -0,0 +1,212 @@
#!/usr/bin/env python3

import numpy as np
import pickle
import re
import sys

from datetime import datetime


def log(msg, accelerator=None, all_threads=False):
    if accelerator is not None and all_threads:
        report_proc = f" ({accelerator.process_index+1}/{accelerator.num_processes})"
    else:
        report_proc = ""

    if accelerator is None or accelerator.is_main_process or all_threads:
        sys.stderr.write(str(datetime.now()) + report_proc + ": " + msg + '\n')


def _same_line_log(msg, len_to_del=0):
    """if sys.stderr.isatty():
        if len_to_del > 0:
            sys.stderr.write("\b" * len_to_del)

        new_len = len(msg)

        sys.stderr.write(msg)
        sys.stderr.flush()

        return new_len
    else:"""
    log(msg)


def debug(msg):
    pass
    ### log("\n(DEBUG) " + msg)


def maybe_convert(value):
    try:
        return int(value)
    except (ValueError, TypeError):
        try:
            return float(value)
        except (ValueError, TypeError):
            return value


def get_changed_config(conf, args):
    arg_dict = args.to_dict()

    for kwarg in arg_dict:
        if hasattr(conf, kwarg) and arg_dict[kwarg] is not None:
            setattr(conf, kwarg, maybe_convert(arg_dict[kwarg]))

    return conf


class SameLineLogger:
    def __init__(self, epoch_len, epoch_num, data_state):
        self.epoch_len = epoch_len
        self.epoch_num = epoch_num
        self.start_global_step = epoch_len * data_state.epoch_idx + data_state.elem_idx

        self.totalx = epoch_len * epoch_num

        self.log_after = []
        self.log_len = 0

        self.start_time = datetime.now()

    def line_start(self):
        _same_line_log(str(datetime.now()) + ": training batches ")

    def step(self, global_batch_idx, epoch_batch_idx, epoch_idx, loss, lr, grad):
        passed_time = datetime.now() - self.start_time
        time_per_batch = passed_time / (global_batch_idx - self.start_global_step)
        prediction = time_per_batch * (self.totalx - global_batch_idx)

        msg = f"{epoch_batch_idx} / {self.epoch_len}, epoch {epoch_idx + 1} / {self.epoch_num}, loss={loss}, avg {time_per_batch}/iter, {prediction} to finish, LR={lr:.2e}, grad={grad:.2e}  "

        new_len = _same_line_log(msg, self.log_len)

        self.log_len = new_len

    def line_break(self):
        sys.stderr.write("\n")


class CmdlineArgs:
    def __init__(self,
                 description,
                 pos_arg_list=None,
                 pos_arg_types=None,
                 kw_arg_dict=None,
                 input_args=None):

        self.description = description

        self.raw_pos_arg_list = pos_arg_list if pos_arg_list is not None else []
        self.raw_pos_arg_types = pos_arg_types \
            if pos_arg_types is not None \
            else [None] * len(self.raw_pos_arg_list)

        self.kw_arg_dict_with_defaults = kw_arg_dict if kw_arg_dict is not None else {}

        kw_vals, cmdline_values = self._to_kwargs(sys.argv[1:] if input_args is None else input_args)

        self._maybe_help(cmdline_values)

        self._handle_positional_args(cmdline_values)

        self._handle_keyword_args(kw_vals)

    @staticmethod
    def _to_kwargs(arg_list):
        key_args = dict(raw_entry.lstrip("-").split("=") for raw_entry in arg_list if "=" in raw_entry)
        filtered_arg_list = [arg for arg in arg_list if "=" not in arg]

        return key_args, filtered_arg_list

    def _handle_keyword_args(self, kw_vals):
        for kw in self.kw_arg_dict_with_defaults:
            if kw in kw_vals:
                val = self._convert_kw(kw_vals, kw)
                del kw_vals[kw]
            else:
                val = self.kw_arg_dict_with_defaults[kw]

            setattr(self, kw, val)

        if kw_vals:
            extra_keys = ", ".join(kw_vals.keys())
            msg = f"command-line keyword arguments '{extra_keys}' are not recognized."

            self._help_message_and_die(extra=msg)

    def _convert_kw(self, kw_vals, kw):
        if self.kw_arg_dict_with_defaults[kw] is None:
            return kw_vals[kw]
        else:
            this_typ = type(self.kw_arg_dict_with_defaults[kw])

            try:
                return this_typ(kw_vals[kw])
            except ValueError:
                self._help_message_and_die(extra=f"could not convert '{kw_vals[kw]}' to '{this_typ}'")

    def _sanity_check_pos_args(self, cmdline_values):
        cmdline_len = len(cmdline_values)

        if cmdline_len < len(self.raw_pos_arg_list):
            self._help_message_and_die(
                extra=f"positional arguments missing: {', '.join(self.raw_pos_arg_list[cmdline_len:])}")

        if cmdline_len > len(self.raw_pos_arg_list):
            self._help_message_and_die(
                extra=f"superfluous positional arguments: {', '.join(cmdline_values[len(self.raw_pos_arg_list):])}")

    def _handle_positional_args(self, cmdline_values):
        self._sanity_check_pos_args(cmdline_values)

        for arg, val, typ in zip(self.raw_pos_arg_list, cmdline_values, self.raw_pos_arg_types):
            try:
                val = val if typ is None else typ(val)
            except ValueError:
                self._help_message_and_die(extra=f"could not convert '{val}' to '{typ}'")

            setattr(self, arg, val)

    def _maybe_help(self, cmdline_values):
        if len(cmdline_values) == 1 and cmdline_values[0] in {"--help", "-h", "-?"}:
            self._help_message_and_die()

    def _help_message_and_die(self, extra=None):
        sys.stderr.write("Help message: " + self.description + "\n")

        if self.raw_pos_arg_list:
            args_descr = ", ".join([f"'{arg}' ({typ.__name__ if typ is not None else 'any'})"
                                    for arg, typ in zip(self.raw_pos_arg_list, self.raw_pos_arg_types)])

            sys.stderr.write(f"Positional arguments: {args_descr}\n")

        if self.kw_arg_dict_with_defaults:
            kw_descr = ", ".join([f"'{kw}' (default: {val})"
                                  for kw, val in self.kw_arg_dict_with_defaults.items()])

            sys.stderr.write(f"Keyword arguments: {kw_descr}\n")

        if extra is not None:
            sys.stderr.write("Error: " + extra + "\n")

        sys.stderr.write("\n")
        sys.exit(-1)

    def to_dict(self):
        return {k: v for k, v in self.__dict__.items()
                if k not in {'description', 'raw_pos_arg_list', 'raw_pos_arg_types', 'kw_arg_dict_with_defaults'}}

    def __str__(self):
        return str(self.to_dict())

    def __repr__(self):
        return self.__str__()


if __name__ == "__main__":
    for dname in sys.argv[1:]:
        d = np.load(dname + "/custom_checkpoint_1.pkl", allow_pickle=True)
        p = pickle.loads(d['custom_checkpoint_1/data.pkl'])
        print(dname, p)
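
Editor's note: a minimal usage sketch for the CmdlineArgs helper above; the tool description, argument names and values are hypothetical and only illustrate the key=value convention the parser expects (types are inferred from the defaults).

# Hypothetical example, not part of the commit.
from kuidastaltsutadalaamat.aux import CmdlineArgs, log

args = CmdlineArgs("demo tool",
                   pos_arg_list=["mdl_id"],
                   kw_arg_dict={"batch_size": 8, "lr": 1e-5},
                   input_args=["models/llama3.2-1b", "batch_size=16"])
log(f"Launched as {args}")  # batch_size is converted to int 16, lr keeps its default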
kuidastaltsutadalaamat/data.py ADDED
@@ -0,0 +1,142 @@
#!/usr/bin/env python3
from .promptops import *

import json
import sys

from random import shuffle

from torch.utils.data import Dataset as TorchDataset, DataLoader

from .aux import log


def tokenize_str(tokenizer, entry, add_eos=True, max_len=3000, for_inf=False):
    if for_inf:
        tokens = tokenizer(
            entry,
            truncation=True,
            max_length=max_len,
            return_attention_mask=True,
            return_tensors="pt"
        )
    else:
        tokens = tokenizer(
            entry,
            truncation=True,
            max_length=max_len,
            return_attention_mask=True
        )

    if add_eos:
        tokens['attention_mask'].append(1)
        tokens['input_ids'].append(tokenizer.eos_token_id)

    return tokens

"""
Load texts into memory and allow looping through them,
returning tokenized tensors.

Currently there is no support for text data that does not fit into memory;
that still needs to be added. Or do HF datasets have something out of the box?
"""
class LazyTokenizingDataset(TorchDataset):
    def __init__(self, texts, tokenizer, max_length=512, prompt_format="raw"):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.prompt_format = prompt_format

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Return plain Python lists; let the collator pad & build labels.
        entry = self.texts[idx]

        prompt = prep_prompt(entry, self.prompt_format)

        return tokenize_str(self.tokenizer, prompt)


class LazyTokenizingInferenceDataset(TorchDataset):
    def __init__(self, texts, tokenizer, prompt_format, max_length=512, debug=False):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.prompt_format = prompt_format
        self.debug = debug

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        entry = self.texts[idx]

        prompt = prep_prompt(entry, self.prompt_format, inference=True)
        result = tokenize_str(self.tokenizer, prompt, add_eos=False, for_inf=True)

        if self.debug:
            log(f"Input: {prompt}")
            log(f"Tokenized: {result}")

        return result


def read_input(path, formt):
    if path is None:
        log("Reading from STDIN")
        fh = sys.stdin
    else:
        fh = open(path, 'r')

    if formt == PF_RAW:
        result = [fh.read()]
    elif formt == PF_RAWLINES:
        result = fh.readlines()
    else:
        result = json.load(fh)

    return result


def get_data_loader(path, prompt_format, tokenizer, debug=False):
    inputs = read_input(path, prompt_format)

    dataset = LazyTokenizingInferenceDataset(inputs, tokenizer, prompt_format, debug=debug)

    """
    data_coll = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
        pad_to_multiple_of=None,  # helps performance; set None if you prefer exact lengths
    )

    data_loader = DataLoader(dataset, collate_fn=data_coll, batch_size=1)
    """

    return dataset


def load_training_data(path, tokenizer, cmd_args):
    with open(path, "r") as f:
        data = json.load(f)

    train_set_iter = LazyTokenizingDataset(data, tokenizer, cmd_args.max_length, cmd_args.prompt_format)

    return train_set_iter


if __name__ == '__main__':
    all_data = []

    for input_file in sys.argv[1:]:
        with open(input_file, "r") as f:
            this_data = json.load(f)
        all_data += this_data

    shuffle(all_data)

    json.dump(all_data, sys.stdout)
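
Editor's note: a hedged sketch of driving LazyTokenizingDataset above; promptops.py is not part of this commit, so the "raw" prompt-format string and the plain-string entry are assumptions, and the model path is a placeholder.

# Illustrative only; entry structure and format string are assumptions.
from transformers import AutoTokenizer
from kuidastaltsutadalaamat.data import LazyTokenizingDataset

tokenizer = AutoTokenizer.from_pretrained("models/llama3.2-1b")   # placeholder path
texts = ["Tere, see on näitelause."]                              # one raw training text
ds = LazyTokenizingDataset(texts, tokenizer, max_length=512, prompt_format="raw")
print(len(ds), ds[0]["input_ids"][:10])                           # token ids, EOS appended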
kuidastaltsutadalaamat/inference.py ADDED
@@ -0,0 +1,170 @@
#!/usr/bin/env python3

from .promptops import *

from .aux import CmdlineArgs, log
from .data import get_data_loader
from .trainllm import env_stuff, load_model, load_tokenizer


import sys
import torch
import json
import torch.distributed as dist

from accelerate import Accelerator

from datetime import datetime

"""
This currently assumes a batch size of 1. With larger batches the padding tokens went
into the decoder. Right-padding as a solution?
"""
def llm_generate(model, tokenizer, tok_batch, debug=False, max_len=2000):
    tok_batch['input_ids'] = tok_batch['input_ids'].to(model.device)
    tok_batch['attention_mask'] = tok_batch['attention_mask'].to(model.device)
    start_time = datetime.now()

    if debug:
        log(f"Tokenized input: {tok_batch['input_ids']}")

    raw_output_toks = model.generate(**tok_batch, tokenizer=tokenizer,
                                     do_sample=False, num_beams=4, max_length=max_len, top_p=None, temperature=None,
                                     eos_token_id=[tokenizer.eos_token_id,
                                                   tokenizer.convert_tokens_to_ids("<|reserved_special_token_14|>")])

    #clean_output_toks = remove_prompt_from_output(tok_batch['attention_mask'], raw_output_toks, filler_id)
    assert len(raw_output_toks) == 1, "Only batch size=1 supported %-("
    gen_idx = len(tok_batch['attention_mask'][0])

    if debug:
        log(f"Full tokenized output: {raw_output_toks[0]}")
        log(f"Full tokens: {tokenizer.convert_ids_to_tokens(raw_output_toks[0])}")
        full_out = tokenizer.batch_decode([raw_output_toks[0]], skip_special_tokens=True)
        log(f"Full text: {full_out[0]}")

    clean_output_toks = raw_output_toks[0][gen_idx:]
    clean_outputs = tokenizer.batch_decode([clean_output_toks], skip_special_tokens=True)

    if debug:
        log(f"Pruned tokenized output: {clean_output_toks}")
        log(f"Pruned tokens: {tokenizer.convert_ids_to_tokens(clean_output_toks)}")
        log(f"Cleaned output: {clean_outputs[0]}")

    end_time = datetime.now()
    log(f"This took: {end_time - start_time}")

    return clean_outputs


def reassemble_multi(list_of_lists):
    result = []

    for gen_idx in range(len(list_of_lists[0])):
        for i in range(len(list_of_lists)):
            if gen_idx < len(list_of_lists[i]):
                result.append(list_of_lists[i][gen_idx])

    return result


def predict(model, tokenizer, data_loader, accel, multi=False, debug=False, max_len=2000):
    outs_final = []

    with torch.no_grad():
        for idx, batch in enumerate(data_loader):
            if idx % accel.num_processes == accel.process_index:
                start_time = datetime.now()
                outputs = llm_generate(model, tokenizer, batch, debug=debug, max_len=max_len)
                end_time = datetime.now()
                log(f"Generated for {idx} in proc {accel.process_index} in {end_time - start_time}")
                outs_final += outputs

    if multi:
        accel.wait_for_everyone()

        rank0_buffer = [None] * accel.num_processes if accel.is_main_process else None
        dist.gather_object(outs_final, rank0_buffer, dst=0)
        if accel.is_main_process:
            outs_final = reassemble_multi(rank0_buffer)
        else:
            outs_final = None

    return outs_final


def _cmdline_args():
    inputs = sys.argv[1:]

    description = """Predict output for an input via prompting"""

    pos_args = ["mdl_id"]

    #post-process the arguments
    args = CmdlineArgs(description, pos_args, input_args=inputs,
                       kw_arg_dict={"debug": False,
                                    "input_file": "none",
                                    "output_file": "none",
                                    "multiproc": False,
                                    "max_len": 2000,
                                    "prompt_format": PF_ALPACA})

    if args.input_file == "none":
        args.input_file = None
    if args.output_file == "none":
        args.output_file = None

    log(f"Launched as {args}")

    return args


def save_all(outputs, args, acc):
    if acc.is_main_process:
        if args.output_file is None:
            log("Writing to STDOUT")
            out_fh = sys.stdout
        else:
            out_fh = open(args.output_file, "w")

        if args.prompt_format in {PF_RAW, PF_RAWLINES}:
            for line in outputs:
                out_fh.write(line + "\n")
        else:
            json.dump(outputs, out_fh)


def and_i_called_this_function_do_main_too():
    args = _cmdline_args()

    if args.multiproc:
        env_stuff()

    acc = Accelerator()
    device = acc.device

    log(f"Device: {device}.", accelerator=acc)

    if not args.multiproc and not acc.is_main_process:
        log("Not launched in multi-processing mode, exiting non-main process.")
        sys.exit(0)

    tokenizer = load_tokenizer(args.mdl_id, acc)

    data_loader = get_data_loader(args.input_file, args.prompt_format, tokenizer, debug=args.debug)

    model = load_model(args.mdl_id, device, acc, attention="eager")
    model.eval()

    log(f"Device: {model.device}.", accelerator=acc)

    log("Model loaded, starting to generate")
    outputs = predict(model, tokenizer, data_loader, acc, multi=args.multiproc, debug=args.debug, max_len=args.max_len)

    save_all(outputs, args, acc)

    log("Done")


if __name__ == "__main__":
    and_i_called_this_function_do_main_too()
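
Editor's note: a minimal single-prompt sketch of llm_generate above, assuming standard Hugging Face loaders in place of trainllm.load_model/load_tokenizer (trainllm.py is not in this commit) and a Llama-3-style tokenizer that defines <|reserved_special_token_14|>; the model path is a placeholder.

# Illustrative only; loading is simplified compared to the module's own main().
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from kuidastaltsutadalaamat.inference import llm_generate

tok = AutoTokenizer.from_pretrained("models/llama3.2-1b")
model = AutoModelForCausalLM.from_pretrained("models/llama3.2-1b", torch_dtype=torch.bfloat16)
batch = tok("Tõlgi inglise keelde: tere hommikust!", return_tensors="pt")  # batch size 1, as assumed above
print(llm_generate(model, tok, batch, max_len=200)[0])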
kuidastaltsutadalaamat/legacy/accel.py ADDED
@@ -0,0 +1,328 @@
import os

import torch

from accelerate import Accelerator
from datetime import datetime
from transformers import get_scheduler

from aux import SameLineLogger, log
from data import DataState, BatchingIterator
from modelops import save_all_models, report_devices


def chain_params(coupling_specs):
    for spec in coupling_specs:
        yield from spec.model.parameters()


class TrainLossList:
    def __init__(self):
        self.data = []

    def append(self, loss_val, sub_batch_idx, epoch_batch_idx, _epoch_idx):
        self.data.append((loss_val, sub_batch_idx, epoch_batch_idx, _epoch_idx))

    def state_dict(self):
        return {'data': self.data}

    def load_state_dict(self, state_dict):
        self.data = state_dict['data']


class SwitchingAccelerator:
    def __init__(self, train_set, train_kwargs, model, tokenizer, preinit_acc=None):
        self.kwargs = train_kwargs
        self.train_set_iter = BatchingIterator(train_set, self.kwargs.batch_size, tokenizer, train_kwargs.max_length)

        self.model = model
        self.tokenizer = tokenizer

        self.train_loss_list = TrainLossList()
        self.data_state = DataState(epoch_idx=0)

        self._init_acc_and_stuff(preinit_acc)

        self._init_time_keepers()

    def _init_time_keepers(self):
        if self.kwargs.log_steps < 0 and self.accelerator.is_main_process:
            t = datetime.now()
            self._tk_zero = t - t

            self._tk_stats = {}
            self._tk_time = {}

    def _add_timekeeper(self, msg):
        if self.kwargs.log_steps < 0 and self.accelerator.is_main_process:
            self._tk_stats[msg] = []
            self._tk_time[msg] = None

    def _add_timekeepers(self, msgs):
        for msg in msgs:
            self._add_timekeeper(msg)

    def _tk_start(self, msg):
        if self.kwargs.log_steps < 0 and self.accelerator.is_main_process:
            assert self._tk_time[msg] is None

            self._tk_time[msg] = datetime.now()

    def _tk_stop(self, msg):
        if self.kwargs.log_steps < 0 and self.accelerator.is_main_process:
            assert self._tk_time[msg] is not None

            this_time = datetime.now() - self._tk_time[msg]
            self._tk_time[msg] = None
            self._tk_stats[msg].append(this_time)

            log(f"{msg} took {this_time}, avg time: " +
                f" {sum(self._tk_stats[msg], self._tk_zero) / len(self._tk_stats[msg])}" +
                f" over {len(self._tk_stats[msg])} samples")

    def __handle_accum(self):

        assert self.kwargs.batch_size % (self.accelerator.num_processes * self.kwargs.nr_sents_per_gpu) == 0,\
            "batch size must be divisible by number of processes and number of segments per GPU"

        accum_steps = int((self.kwargs.batch_size / self.accelerator.num_processes) / self.kwargs.nr_sents_per_gpu)
        self.accelerator.gradient_accumulation_steps = accum_steps

        log(f"Nr sents/GPU: {self.kwargs.nr_sents_per_gpu}, accum steps: {accum_steps}, " +
            f"nr. procs: {self.accelerator.num_processes}, batch size: {self.kwargs.batch_size}",
            accelerator=self.accelerator)

    def ___get_train_scalars(self):
        epoch_len = len(self.train_set_iter)
        train_len = epoch_len * self.kwargs.epochs

        num_warmup = 0  #int(train_len * 0.01)

        log(f"Warmup steps: {num_warmup}, epoch len: {epoch_len}, train len: {train_len}", accelerator=self.accelerator)

        return train_len, num_warmup

    def __init_opt_lr_and_what_else(self):
        train_len, num_warmup = self.___get_train_scalars()

        opt = torch.optim.AdamW(self.model.parameters(), lr=self.kwargs.lr)

        numtr = train_len * self.accelerator.num_processes
        lr_scheduler = get_scheduler("linear", optimizer=opt, num_warmup_steps=num_warmup, num_training_steps=numtr)

        self.optimizer, self.lr_scheduler, self.model = self.accelerator.prepare(opt, lr_scheduler, self.model)

        self.accelerator.register_for_checkpointing(self.data_state, self.train_loss_list)

    def _init_acc_and_stuff(self, preinit_acc=None):
        #self.accelerator = Accelerator(gradient_accumulation_steps=self.kwargs.accum_steps, kwargs_handlers=[DistributedDataParallelKwargs(find_unused_parameters=True)])

        if preinit_acc is None:
            self.accelerator = Accelerator()
        else:
            self.accelerator = preinit_acc

        self.__handle_accum()

        self.__init_opt_lr_and_what_else()

        if self.kwargs.continue_training:
            self.accelerator.load_state(self.kwargs.mdl_id)
            log(f"Reloaded data state: {self.data_state}", accelerator=self.accelerator)

    def train(self, dry_run=False):
        try:
            self._main_loop(dry_run)
        except Exception as e:
            #in multiprocess scenarios it is hard to read the stack trace, so just show one:
            if self.accelerator.is_main_process:
                raise e

        self.accelerator.wait_for_everyone()

        unwr_coupled_model = self.accelerator.unwrap_model(self.model)

        return unwr_coupled_model

    def _prepare_inputs(self, batch, sub_batch_idx, sub_batch_size, proc_batch_size):
        from_proc_idx = proc_batch_size * self.accelerator.process_index + sub_batch_size * sub_batch_idx
        to_proc_idx = from_proc_idx + sub_batch_size

        #log(f"----> DEBUG for sub_b idx {sub_batch_idx}, proc {self.accelerator.process_index}: {from_proc_idx}:{to_proc_idx}")

        return {k: batch[k][from_proc_idx:to_proc_idx].to(self.accelerator.device) for k in batch}

    def _get_split_batch_params(self):
        batch_nr_snts = self.kwargs.batch_size

        assert batch_nr_snts % self.accelerator.num_processes == 0, "Batch size must be divisible by number of processes."

        proc_batch_nr_snts = batch_nr_snts // self.accelerator.num_processes

        sub_batch_size = self.kwargs.nr_sents_per_gpu

        nr_steps = -(proc_batch_nr_snts // -sub_batch_size)

        #log(f"--> DEBUG: sub_batch {sub_batch_size} X steps {nr_steps} ~ {proc_batch_nr_snts} ({batch_nr_snts} / {self.accelerator.num_processes})", accelerator=self.accelerator)
        return sub_batch_size, nr_steps, proc_batch_nr_snts

    def _report_mem_every_once_in_a_while(self, sub_batch_idx, epoch_batch_idx, batch_dim):
        if sub_batch_idx == 0:
            report_devices(f"training memory usage (batch size: {self.kwargs.batch_size} / {batch_dim[1]}",
                           self.accelerator, self.model)

    def _main_loop(self, dry_run):
        if self.accelerator.is_main_process:
            logger = SameLineLogger(len(self.train_set_iter), self.kwargs.epochs, self.data_state)
            logger.line_start()
        else:
            logger = None

        self.model.train()
        self.train_set_iter.thats_where(self.data_state)

        tks = "full_batch", "prep_inputs", "forward", "backward", "upd_step"
        tk_batch, tk_prep, tk_fw, tk_bk, tk_step = tks
        self._add_timekeepers(tks)

        with self.accelerator.accumulate(self.model):
            for _epoch_idx in range(self.data_state.epoch_idx, self.kwargs.epochs):
                for batch, epoch_batch_idx in self.train_set_iter:
                    if dry_run:
                        log(f"Dry run, batch width: {batch['input_ids'].size()}")
                    else:
                        self._report_mem_every_once_in_a_while(0, epoch_batch_idx, batch['input_ids'].size())
                        sub_batch_size, nr_steps, proc_batch_size = self._get_split_batch_params()

                        self._tk_start(tk_batch)

                        loss = None
                        for sub_batch_idx in range(nr_steps):
                            self._tk_start(tk_prep)  ########
                            inputs = self._prepare_inputs(batch, sub_batch_idx, sub_batch_size, proc_batch_size)

                            inputs['labels'] = inputs['input_ids'].copy()
                            self._tk_stop(tk_prep)  ########

                            self._tk_start(tk_fw)  ########
                            outputs = self.model(**inputs)

                            loss = outputs.loss
                            self._tk_stop(tk_fw)  ########

                            self.train_loss_list.append(loss.item(), sub_batch_idx, epoch_batch_idx, _epoch_idx)

                            self._tk_start(tk_bk)  ########
                            self.accelerator.backward(loss)
                            self._tk_stop(tk_bk)  ########

                            self._tk_start(tk_step)  ########
                            self.optimizer.step()
                            self.lr_scheduler.step()
                            self.optimizer.zero_grad()
                            self._tk_stop(tk_step)  ########

                        self._tk_stop(tk_batch)

                        #assert self.accelerator.sync_gradients, "It is not time to sync gradients yet."
                        self._step_and_perhaps_save(logger, epoch_batch_idx, _epoch_idx, float(loss.item()))

        if self.accelerator.is_main_process:
            logger.line_break()

    def get_total_grad(self):
        result = 0
        grad_count = 0
        all_count = 0

        for p in self.model.parameters():
            if p.grad is not None:
                result += p.grad.abs().mean().item()
                grad_count += 1
            all_count += 1

        return result/grad_count if grad_count != 0 else -1

    def _step_and_perhaps_save(self, logger, epoch_batch_idx, epoch_i, loss):
        epoch_len = len(self.train_set_iter)
        global_batch_idx = epoch_batch_idx + epoch_i * epoch_len

        is_end_of_epoch = (epoch_batch_idx == epoch_len)

        if self.accelerator.is_main_process \
                and (epoch_batch_idx % self.kwargs.log_steps == 0 or is_end_of_epoch):
            grad = self.get_total_grad()

            logger.step(global_batch_idx, epoch_batch_idx, epoch_i, loss, self.lr_scheduler.get_last_lr()[0], grad)

        #self.optimizer.zero_grad()

        if (global_batch_idx % self.kwargs.save_steps == 0) or is_end_of_epoch:
            self.accelerator.wait_for_everyone()

            if self.accelerator.is_main_process:
                logger.line_break()
                log(f"Saving at {epoch_batch_idx} steps, epoch {epoch_i + 1} ({global_batch_idx} global steps)", accelerator=self.accelerator)

            self._save_all(global_batch_idx, epoch_i)

            logger.line_start()

    def _save_all(self, global_batch_idx, epoch_i):
        epoch_len = len(self.train_set_iter)

        ckpt_name = (f"checkpoint-e{epoch_i + 1:02}-" +
                     (f"b{global_batch_idx:07}" if (global_batch_idx % epoch_len) else f"full"))

        this_location = os.path.join(self.kwargs.save_location, ckpt_name)
        if os.path.exists(this_location):
            raise FileExistsError(f"Cannot overwrite existing checkpoint {this_location}!")

        self.data_state.copy_from(self.train_set_iter.where_are_we(), epoch_idx=epoch_i)

        model_to_save = self.accelerator.unwrap_model(self.model)

        save_all_models(this_location, model_to_save, self.tokenizer, trainer=self.accelerator)


def test_this_damn_thing():
    # testing
    import torch
    import json
    from torch.optim import AdamW
    from modelops import hf_tok
    from transformers import AutoModelForCausalLM, AutoTokenizer

    mdl_id = "models/llama3.2-1b"

    tokenizer = AutoTokenizer.from_pretrained(mdl_id, token=hf_tok)
    model = AutoModelForCausalLM.from_pretrained(mdl_id, token=hf_tok, torch_dtype=torch.bfloat16)
    with open("tmpx.json", "r") as f:
        training_data_raw = json.load(f)

    optimizer = AdamW(model.parameters(), lr=5e-6)

    print("Initial 0:", optimizer.param_groups[0]['lr'])  # Should be [5e-6]

    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=2445
    )

    accel = Accelerator()

    p_optimizer, p_lr_scheduler, p_model = accel.prepare(optimizer, scheduler, model)

    print("Initial 1:", p_lr_scheduler.get_last_lr())  # Should be [5e-6]

    """
    for _ in range(2):
        optimizer.step()
        scheduler.step()
        print("Step:", scheduler.get_last_lr())
    """


if __name__ == "__main__":
    test_this_damn_thing()
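
Editor's note: the accumulation arithmetic in __handle_accum and _get_split_batch_params above reduces to the small worked example below; the numbers are illustrative only.

# With an effective batch of 64 sentences, 4 processes and 4 sentences per GPU:
batch_size, num_processes, nr_sents_per_gpu = 64, 4, 4
assert batch_size % (num_processes * nr_sents_per_gpu) == 0
accum_steps = int((batch_size / num_processes) / nr_sents_per_gpu)   # 16 sentences per process -> 4 accumulation steps
nr_steps = -((batch_size // num_processes) // -nr_sents_per_gpu)     # ceiling division, also 4 sub-batches
print(accum_steps, nr_steps)                                         # 4 4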
kuidastaltsutadalaamat/legacy/accel_backup.py ADDED
@@ -0,0 +1,237 @@
"""
import os

import torch

from accelerate import Accelerator, DistributedDataParallelKwargs
from transformers import get_scheduler

from aux import SameLineLogger, log
from data import DataState
from langconv import is_dec_only_llm
from modelops import save_all_models, report_devices
from translate import encode


raise NotImplementedError("This is a backup package, do not run or import from it")


def chain_params(coupling_specs):
    for spec in coupling_specs:
        yield from spec.model.parameters()


class TrainLossList:
    def __init__(self):
        self.data = []

    def append(self, loss_val, src_k, tgt_k):
        self.data.append((loss_val, src_k, tgt_k))

    def state_dict(self):
        return {'data': self.data}

    def load_state_dict(self, state_dict):
        self.data = state_dict['data']


class SwitchingAccelerator:
    def __init__(self, coupling_specs, train_set, train_kwargs):
        self.coupling_specs = coupling_specs

        self.train_set = train_set
        self.kwargs = train_kwargs

        self.is_generative = is_dec_only_llm(self.coupling_specs[0].tokenizer)

        self.train_loss_list = TrainLossList()
        self.data_state = DataState(epoch_idx=0)

        self._init_acc_and_stuff()

    def _init_acc_and_stuff(self):
        #self.accelerator = Accelerator(gradient_accumulation_steps=self.kwargs.accum_steps, kwargs_handlers=[DistributedDataParallelKwargs(find_unused_parameters=True)])
        #self.accelerator = Accelerator(gradient_accumulation_steps=self.kwargs.accum_steps)
        self.accelerator = Accelerator()

        epoch_len = len(self.train_set)
        train_len = epoch_len * self.kwargs.epochs

        num_warmup = int(train_len * 0.01)

        log(f"Warmup steps: {num_warmup}, epoch len: {epoch_len}, train len: {train_len}", accelerator=self.accelerator)

        opt = torch.optim.AdamW(chain_params(self.coupling_specs), lr=self.kwargs.lr)
        lr_scheduler = get_scheduler("linear", optimizer=opt, num_warmup_steps=num_warmup,
                                     num_training_steps=train_len * self.accelerator.num_processes)
        models = [s.model for s in self.coupling_specs]

        self.optimizer, self.lr_scheduler, *self.models = self.accelerator.prepare(opt, lr_scheduler, *models)

        self.accelerator.register_for_checkpointing(self.lr_scheduler, self.data_state, self.train_loss_list)

        if self.kwargs.continue_training:
            self.accelerator.load_state(self.kwargs.mdl_id)
            log(f"Reloaded data state: {self.data_state}", accelerator=self.accelerator)

    def train(self):
        try:
            self._main_loop()
        except Exception as e:
            #in multi-process scenarios it is hard to read the stack trace, so just show one:
            if self.accelerator.is_main_process:
                raise e

        self.accelerator.wait_for_everyone()

        unwr_coupled_model = self.accelerator.unwrap_model(self.models[0])

        return unwr_coupled_model, self.train_loss_list

    def _split_batch_and_bin_idxs(self, batch_with_idxs):
        if self.is_generative:
            batch, _ = batch_with_idxs
            src_k = 0
            tgt_k = 0
        else:
            batch, src_k, tgt_k, _ = batch_with_idxs
        return batch, src_k, tgt_k

    def _prepare_inputs(self, batch, sub_batch_idx, sub_batch_size, proc_batch_size):
        from_proc_idx = proc_batch_size * self.accelerator.process_index + sub_batch_size * sub_batch_idx
        to_proc_idx = from_proc_idx + sub_batch_size

        #log(f"----> DEBUG for sub_b idx {sub_batch_idx}, proc {self.accelerator.process_index}: {from_proc_idx}:{to_proc_idx}")

        return {k: batch[k][from_proc_idx:to_proc_idx].to(self.accelerator.device) for k in batch}

    def _get_split_batch_params(self, batch):
        batch_nr_snts = batch['input_ids'].size()[0]
        snt_nr_words = batch['input_ids'].size()[1]

        assert batch_nr_snts % self.accelerator.num_processes == 0, "Batch size must be divisible by number of processes."

        proc_batch_nr_snts = batch_nr_snts // self.accelerator.num_processes

        if self.kwargs.nr_snts_in_batch > 0:
            sub_batch_size = self.kwargs.nr_snts_in_batch
        else:
            sub_batch_size = max(1, self.kwargs.nr_words_in_batch // snt_nr_words)
            #log(f"DEBUG: #words/snt {snt_nr_words} X #snt in sub batch {sub_batch_size} = {snt_nr_words*sub_batch_size} ~ {self.kwargs.nr_words_in_batch}", accelerator=self.accelerator)

        nr_steps = -(proc_batch_nr_snts // -sub_batch_size)

        #log(f"--> DEBUG: sub_batch {sub_batch_size} X steps {nr_steps} ~ {proc_batch_nr_snts} ({batch_nr_snts} / {self.accelerator.num_processes})", accelerator=self.accelerator)
        return sub_batch_size, nr_steps, proc_batch_nr_snts

    def _main_loop(self):
        #countdown_till_do_it_once = 0

        if self.accelerator.is_main_process:
            logger = SameLineLogger(len(self.train_set), self.kwargs.epochs)
            logger.line_start()
        else:
            logger = None

        self.models[0].train()
        self.train_set.thats_where(self.data_state)

        for _epoch_idx in range(self.data_state.epoch_idx, self.kwargs.epochs):
            for batch_with_bin_idxs, epoch_batch_idx in self.train_set:
                batch, src_k, tgt_k = self._split_batch_and_bin_idxs(batch_with_bin_idxs)
                sub_batch_size, nr_steps, proc_batch_size = self._get_split_batch_params(batch)

                loss = None

                for sub_batch_idx in range(nr_steps):
                    inputs = self._prepare_inputs(batch, sub_batch_idx, sub_batch_size, proc_batch_size)

                    if self.is_generative:
                        inputs['labels'] = inputs['input_ids']
                        outputs = self.models[0](**inputs)
                    else:
                        encoder_vecs = encode(self.models[src_k], inputs)
                        outputs = self.models[tgt_k](attention_mask=inputs['attention_mask'], labels=inputs['labels'], encoder_outputs=encoder_vecs)

                    loss = outputs.loss

                    #if countdown_till_do_it_once > 0:
                    #    countdown_till_do_it_once -= 1
                    #elif countdown_till_do_it_once == 0:
                    if sub_batch_idx == 5:
                        batch_size = sum([inputs[k].size()[0] * inputs[k].size()[1] for k in 'input_ids labels attention_mask'.split(' ')])
                        report_devices(f"training memory usage (batch size: {batch_size}; inputs:" +
                                       f"snts {inputs['input_ids'].size()[0]} X words {inputs['input_ids'].size()[1]})",
                                       self.accelerator, self.models[0])
                        countdown_till_do_it_once = 0

                    self.train_loss_list.append(loss.item(), src_k, tgt_k)

                    self.accelerator.backward(loss)

                    for k in inputs:
                        inputs[k] = inputs[k].to('cpu')

                self._step_and_perhaps_save(logger, epoch_batch_idx, _epoch_idx, float(loss.item()))

        if self.accelerator.is_main_process:
            logger.line_break()

    def get_total_grad(self):
        result = 0
        grad_count = 0
        all_count = 0

        for p in self.models[0].parameters():
            if p.grad is not None:
                result += p.grad.abs().mean().item()
                grad_count += 1
            all_count += 1

        return result/grad_count if grad_count > 0 else -1

    def _step_and_perhaps_save(self, logger, epoch_batch_idx, epoch_i, loss):
        epoch_len = len(self.train_set)
        global_batch_idx = epoch_batch_idx + epoch_i * epoch_len

        self.optimizer.step()
        self.lr_scheduler.step()
        self.accelerator.wait_for_everyone()

        is_end_of_epoch = (epoch_batch_idx == epoch_len)

        if self.accelerator.is_main_process and (epoch_batch_idx % self.kwargs.log_steps == 0 or is_end_of_epoch):
            grad = self.get_total_grad()
            logger.step(global_batch_idx, epoch_batch_idx, epoch_i, loss, self.lr_scheduler.get_last_lr()[0], grad)

        self.optimizer.zero_grad()

        if (global_batch_idx % self.kwargs.save_steps == 0) or is_end_of_epoch:
            self.accelerator.wait_for_everyone()

            if self.accelerator.is_main_process:
                logger.line_break()
                log(f"Saving at {epoch_batch_idx} steps, epoch {epoch_i + 1} ({global_batch_idx} global steps)", accelerator=self.accelerator)

            self._save_all(global_batch_idx, epoch_i)

            logger.line_start()

    def _save_all(self, global_batch_idx, epoch_i):
        epoch_len = len(self.train_set)

        ckpt_name = (f"checkpoint-e{epoch_i + 1:02}-" +
                     (f"b{global_batch_idx:07}" if (global_batch_idx % epoch_len) else f"full"))

        this_location = os.path.join(self.kwargs.save_location, ckpt_name)
        if os.path.exists(this_location):
            raise FileExistsError(f"Cannot overwrite existing checkpoint {this_location}!")

        self.data_state.copy_from(self.train_set.where_are_we(), epoch_idx=epoch_i)

        model_to_save = self.accelerator.unwrap_model(self.models[0])

        save_all_models(this_location, model_to_save, self.coupling_specs[0].tokenizer,
                        self.coupling_specs, trainer=self.accelerator)
"""
kuidastaltsutadalaamat/legacy/benchmark.py ADDED
@@ -0,0 +1,190 @@
#!/usr/bin/env python3

import sys
import os
import json

from collections import defaultdict
from data import split_by_lang, make_path_compatible, get_tr_pairs
from inference import coupled_translate, load_and_init_module_config, neurotolge_in_batches
from evaluate import load as load_metric
from legacy.langconv import get_mdl_type, get_joshi_class
from accelerate import Accelerator

from aux import log


def get_hyp_cache_dir(model_location, create=False):
    hyp_location = os.path.join(model_location, "hyp_cache")
    if create:
        os.makedirs(hyp_location, exist_ok=True)
    return hyp_location


def get_hyp_cache_filename(model_location, benchmark_corpus, src_lang, tgt_lang):
    hyp_location = get_hyp_cache_dir(model_location)

    corpus_base = os.path.basename(benchmark_corpus)
    basename = f"{corpus_base}-{src_lang}-to-{tgt_lang}"

    hyp_file = os.path.join(hyp_location, f"{basename}.hyp")
    src_file = os.path.join(hyp_location, f"{basename}.src")

    return hyp_file, src_file


def get_benchmark_filename(model_location, benchmark_corpus):
    corpus_base = os.path.basename(benchmark_corpus)
    hyp_file = f"{corpus_base}-scores.json"
    return os.path.join(model_location, hyp_file)


def load_hyps_from_file(filename):
    with open(filename, "r", encoding="utf-8") as f:
        return [line.strip() for line in f]


def save_hyps_to_file(hypos, filename):
    if hypos is not None:
        with open(filename, "w", encoding="utf-8") as f:
            for hyp in hypos:
                f.write(hyp + "\n")


def load_or_translate(mod_config, input_output_list, lp, model_location, benchmark_corpus):
    src_lang, tgt_lang = lp.split("-")

    inputs, _ = zip(*input_output_list)

    cache_filename, src_filename = get_hyp_cache_filename(model_location, benchmark_corpus, src_lang, tgt_lang)

    try:
        hypos = load_hyps_from_file(cache_filename)
    except FileNotFoundError:
        if model_location == "models/neurotolge":
            hypos = neurotolge_in_batches(inputs, src_lang, tgt_lang)
        else:
            hypos = coupled_translate(mod_config, inputs, src_lang, tgt_lang)

        if hypos is not None:
            save_hyps_to_file(hypos, cache_filename)
            save_hyps_to_file(inputs, src_filename)

    return zip(inputs, hypos)


def translate_all_hyps(lp_test_set_dict, module_conf, model_id, corpus_id, accelerator=None):
    if accelerator is not None:
        key_list = sorted(lp_test_set_dict.keys())
        for idx, lp in enumerate(key_list):
            if idx % accelerator.num_processes == accelerator.process_index:
                log(f"Process {accelerator.process_index} translating {lp}")
                load_or_translate(module_conf, lp_test_set_dict[lp], lp, model_id, corpus_id)
        accelerator.wait_for_everyone()
    else:
        result = dict()
        for i, lp in enumerate(lp_test_set_dict.keys()):
            log(f"Translating {lp}, {i + 1}/{len(lp_test_set_dict)}")
            result[lp] = load_or_translate(module_conf, lp_test_set_dict[lp], lp, model_id, corpus_id)
        return result


def get_joshi_lp(from_lang, to_lang):
    from_joshi = get_joshi_class(from_lang)
    to_joshi = get_joshi_class(to_lang)

    return f"{from_joshi}-{to_joshi}"


def get_all_scores(hyps_dict, lp_test_sets, metric_dict):
    scores = dict()
    avgs = defaultdict(list)

    for lp in lp_test_sets:
        from_lang, to_lang = lp.split("-")
        jlp = get_joshi_lp(from_lang, to_lang)

        _, outputs = zip(*lp_test_sets[lp])

        preds = None if hyps_dict[lp] is None else [hyp for _, hyp in hyps_dict[lp]]

        for metric_name in metric_dict:
            metric_func = metric_dict[metric_name]

            if preds is not None:
                metric_value = metric_func.compute(predictions=preds, references=outputs)

                scores[lp + "-" + metric_name] = metric_value['score']

                avgs[jlp + "-" + metric_name].append(metric_value['score'])

    for avg_k in avgs:
        scores[avg_k] = sum(avgs[avg_k]) / len(avgs[avg_k])

    return scores


def save_scores(scores, mdl_id, corpus):
    filename = get_benchmark_filename(mdl_id, corpus)
    with open(filename, "w") as ofh:
        json.dump(scores, ofh, indent=2, sort_keys=True)


def benchmark_neurotolge(corpus):
    log("Loading data")
    lp_test_sets = split_by_lang(filename=corpus, model_type=None)

    log("Starting benchmarking")
    _ = get_hyp_cache_dir("models/neurotolge", create=True)

    hyps_dict = translate_all_hyps(lp_test_sets, None, "models/neurotolge", corpus)

    log("Loading metrics")
    exp_id = "neurotõlge---" + make_path_compatible(corpus)
    metric_dict = {
        'bleu': load_metric("sacrebleu", experiment_id=exp_id),
        'chrf': load_metric("chrf", experiment_id=exp_id) }

    scores = get_all_scores(hyps_dict, lp_test_sets, metric_dict)

    save_scores(scores, "models/neurotolge", corpus)


def benchmark_local_model(mdl_id, corpus):
    accelerator = Accelerator()

    main_model, module_config = load_and_init_module_config(mdl_id, accelerator)

    log("Loading data", accelerator=accelerator)
    lp_test_sets = split_by_lang(filename=corpus, model_type=get_mdl_type(main_model))

    log("Loading metrics", accelerator=accelerator)
    exp_id = make_path_compatible(mdl_id) + "---" + make_path_compatible(corpus)

    metric_dict = {
        'bleu': load_metric("sacrebleu", experiment_id=exp_id),
        'chrf': load_metric("chrf", experiment_id=exp_id) }

    log("Starting benchmarking", accelerator=accelerator)

    if accelerator.is_main_process:
        _ = get_hyp_cache_dir(mdl_id, create=True)

    translate_all_hyps(lp_test_sets, module_config, mdl_id, corpus, accelerator)

    if accelerator.is_main_process:
        fin_hyps_dict = translate_all_hyps(lp_test_sets, module_config, mdl_id, corpus)

        scores = get_all_scores(fin_hyps_dict, lp_test_sets, metric_dict)

        save_scores(scores, mdl_id, corpus)


if __name__ == '__main__':
    mdl_id_param = sys.argv[1]
    corpus_param = sys.argv[2]

    if mdl_id_param == "neurotolge":
        benchmark_neurotolge(corpus_param)
    else:
        benchmark_local_model(mdl_id_param, corpus_param)
kuidastaltsutadalaamat/legacy/data.py ADDED
@@ -0,0 +1,164 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import json
4
+ import sys
5
+
6
+ from torch.utils.data import IterableDataset
7
+ from random import shuffle, randint
8
+ from legacy.tokops import tokenize_batch
9
+ from aux import log
10
+
11
+ def prep_llm_input(ljmftpl):
12
+ raise NotImplementedError
13
+
14
+ #{'task': 'translate' / 'approx-translate' / 'generate',
15
+ # 'src_segm': src_segm,
16
+ # 'tgt_segm': tgt_segm,
17
+ # 'src_lang': src_lang,
18
+ # 'tgt_lang': tgt_lang}
19
+
20
+ result = f"{ljmftpl['src_segm']}\n=====\nis in {ljmftpl['src_lang']}"
21
+
22
+ if ljmftpl['task'] in {'translate', 'approx-translate'}:
23
+ result += f"; {ljmftpl['task']} to {ljmftpl['tgt_lang']}:\n{ljmftpl['tgt_segm']}"
24
+
25
+ return result
26
+
27
+ def make_path_compatible(filename):
28
+ return filename.replace("/", "_").replace(":", "-")
29
+
30
+ def do_list_in_batches(data, batch_size):
31
+ i = 0
32
+
33
+ while i < len(data):
34
+ yield data[i:i + batch_size]
35
+ i += batch_size
36
+
37
+
38
+ class DataState:
39
+ def __init__(self, elem_idx = 0, shard_idx = 0, epoch_idx = None):
40
+ self.elem_idx = elem_idx
41
+ self.shard_idx = shard_idx
42
+ self.epoch_idx = epoch_idx
43
+
44
+ def state_dict(self):
45
+ return {'elem_idx': self.elem_idx, 'shard_idx': self.shard_idx, 'epoch_idx': self.epoch_idx}
46
+
47
+ def load_state_dict(self, state_dict):
48
+ self.elem_idx = state_dict['elem_idx']
49
+ self.shard_idx = state_dict['shard_idx']
50
+ self.epoch_idx = state_dict['epoch_idx']
51
+
52
+ def copy_from(self, src_ds, epoch_idx = None):
53
+ self.shard_idx = src_ds.shard_idx
54
+ self.elem_idx = src_ds.elem_idx
55
+
56
+ if epoch_idx is not None:
57
+ self.epoch_idx = epoch_idx
58
+
59
+ def __str__(self):
60
+ return 'DataState(elem_idx={}, shard_idx={}, epoch_idx={})'.format(self.elem_idx, self.shard_idx, self.epoch_idx)
61
+
62
+ def __repr__(self):
63
+ return self.__str__()
64
+
65
+
66
+ class BatchingIterator(IterableDataset):
67
+ def __init__(self, batched_data, batch_size, tokenizer, max_len=8000):
68
+ assert len(batched_data[0]) == batch_size, "loaded data batch size and specified batch size differ"
69
+
70
+ self.batched_data = batched_data
71
+
72
+ self.tokenizer = tokenizer
73
+ self.max_len = max_len
74
+
75
+ self.curr_elem_idx = 0
76
+
77
+ self.data_len = len(self.batched_data)
78
+
79
+ def __len__(self):
80
+ return self.data_len
81
+
82
+ def __iter__(self):
83
+ #self.curr_elem_idx = 0
84
+ return self
85
+
86
+ def where_are_we(self):
87
+ return DataState(shard_idx=0, elem_idx=self.curr_elem_idx)
88
+
89
+ def thats_where(self, data_state):
90
+ self.curr_elem_idx = data_state.elem_idx
91
+
92
+ def _tokenize(self, prepped_segm_list):
93
+ #self.tokenizer.pad_token = '<|reserved_special_token_0|>'
94
+ #tokenized_batch = self.tokenizer(prepped_segm_list, return_tensors="pt", max_length=self.max_len,
95
+ # truncation=True, add_special_tokens=True,
96
+ # padding=True)
97
+ tokenized_batch = tokenize_batch(self.tokenizer, prepped_segm_list, maxlen=self.max_len)
98
+ return tokenized_batch, self.curr_elem_idx + 1
99
+
100
+ def __next__(self):
101
+ if self.curr_elem_idx >= self.data_len:
102
+ self.curr_elem_idx = 0
103
+ raise StopIteration
104
+ else:
105
+ batch = self._tokenize(self.batched_data[self.curr_elem_idx])
106
+ self.curr_elem_idx += 1
107
+ return batch
108
+
109
+
110
+ def shuffle_data():
111
+ # open a list of tuples, save a list of batches of strings made of these tuples
112
+ input_file = sys.argv[1]
113
+ output_file = sys.argv[2]
114
+
115
+ try:
116
+ batch_size = int(sys.argv[3])
117
+ except IndexError:
118
+ batch_size = None
119
+
120
+ log("Reading data")
121
+ # read the tuples
122
+ with open(input_file, "r") as f:
123
+ #raw_data = json.load(f)
124
+ final_data = json.load(f)
125
+
126
+ log("Making strings")
127
+ # make strings out of tuples
128
+ unsorted_data_in_elems = [prep_llm_input(s) for s in raw_data]
129
+
130
+ if batch_size is None:
131
+ final_data = unsorted_data_in_elems
132
+ else:
133
+ # if last batch is undersized, get some random elements to compensate
134
+ while len(unsorted_data_in_elems) % batch_size != 0:
135
+ new_elem_idx = randint(0, len(unsorted_data_in_elems) - 1)
136
+ unsorted_data_in_elems.append(unsorted_data_in_elems[new_elem_idx])
137
+
138
+ log("Sorting and grouping")
139
+ # sort by length
140
+ sorted_data_in_elems = sorted(unsorted_data_in_elems, key=lambda x: len(x), reverse=True)
141
+
142
+ # group into batches
143
+ final_data = list(do_list_in_batches(sorted_data_in_elems, batch_size))
144
+
145
+ log("Shuffling")
146
+ # shuffle the batches / sentences
147
+ shuffle(final_data)
148
+
149
+ log("Saving")
150
+ # save the result
151
+ with open(output_file, "w") as f:
152
+ json.dump(final_data, f)
153
+
154
+ if __name__ == '__main__':
155
+ all_data = []
156
+
157
+ for input_file in sys.argv[1:]:
158
+ with open(input_file, "r") as f:
159
+ this_data = json.load(f)
160
+ all_data += this_data
161
+
162
+ shuffle(all_data)
163
+
164
+ json.dump(all_data, sys.stdout)
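
A minimal usage sketch for the loader above (not part of the commit; the tokenizer id and file name are placeholders). BatchingIterator expects the pre-batched JSON written by shuffle_data and yields (tokenized_batch, next_elem_idx) pairs, while DataState makes the position checkpointable via where_are_we()/thats_where():

    import json
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")  # placeholder model id
    with open("shuffled_batches.json") as f:
        batched = json.load(f)                 # list of batches, each of length batch_size

    it = BatchingIterator(batched, batch_size=len(batched[0]), tokenizer=tok)

    for step, (tokenized, next_idx) in enumerate(it):
        # ... training step on `tokenized` ...
        if step == 100:
            state = it.where_are_we()          # DataState(elem_idx=..., shard_idx=0)
            break

    # later, e.g. after reloading a checkpoint, resume from the saved position:
    it.thats_where(state)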
kuidastaltsutadalaamat/legacy/data_backup.py ADDED
@@ -0,0 +1,804 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import json
4
+ #import os
5
+ import sys
6
+ import torch
7
+ #import re
8
+ import math
9
+
10
+ from torch.utils.data import IterableDataset
11
+ from collections import namedtuple, defaultdict
12
+ from random import randrange, shuffle, randint
13
+ #from pathlib import Path
14
+
15
+ #from aux import log
16
+ #from langconv import any_to_madlad, any_to_nllb, is_nllb, is_madlad, get_mdl_type, any_to_mdl_type, is_dec_only_llm, \
17
+ # base_to_nllb
18
+ #from tokops import tokenizeit
19
+
20
+ # TrPair = namedtuple('TrPair', ["src_lang", "tgt_lang", "input", "output"])
21
+
22
+ """
23
+ def prep_llm_input(ljmftpl):
24
+ #{'task': 'translate' / 'approx-translate' / 'generate',
25
+ # 'src_segm': src_segm,
26
+ # 'tgt_segm': tgt_segm,
27
+ # 'src_lang': src_lang,
28
+ # 'tgt_lang': tgt_lang}
29
+
30
+ # it's a tuple
31
+ if "src_segm" in ljmftpl and "task" in ljmftpl:
32
+ if ljmftpl['task'] in {'translate', 'approx-translate'}:
33
+ return (f"{ljmftpl['src_segm']}\n=====\n{ljmftpl['task']} from {ljmftpl['src_lang']}; " +
34
+ f"to {ljmftpl['tgt_lang']}:\n{ljmftpl['tgt_segm']}")
35
+
36
+ elif ljmftpl['task'] == 'generate':
37
+ return f"{ljmftpl['src_segm']}\n=====\nis in {ljmftpl['src_lang']};"
38
+
39
+ # it's a string
40
+ else:
41
+ return ljmftpl
42
+
43
+
44
+ def make_path_compatible(filename):
45
+ return filename.replace("/", "_").replace(":", "-")
46
+
47
+ def do_list_in_batches(data, batch_size):
48
+ i = 0
49
+
50
+ while i < len(data):
51
+ yield data[i:i + batch_size]
52
+ i += batch_size
53
+ """
54
+ """
55
+ def do_bins_in_batches(bins, batch_size, sort_by_length):
56
+ result_list = []
57
+
58
+ for src_k in bins:
59
+ for tgt_k in bins[src_k]:
60
+ if src_k == 0 or tgt_k == 0:
61
+ result_list += [(e, src_k, tgt_k) for e in do_list_in_batches(bins[src_k][tgt_k], batch_size)]
62
+
63
+ shuffle(result_list)
64
+
65
+ return result_list
66
+
67
+
68
+ def _post_proc(text, lang):
69
+ if lang == 'liv' and "’" in text and "O’R" not in text:
70
+ return text.replace("’", "")
71
+ else:
72
+ return text
73
+
74
+
75
+ def clean_entry(entry, leave_out):
76
+ result = {k: _post_proc(entry[k], k) for k in entry if entry[k].strip() and k not in leave_out}
77
+ return result
78
+
79
+
80
+ def load_json_data(path, leave_out={}, skip_cats=True, load_mono=True):
81
+ with open(path, 'r') as f:
82
+ data = json.load(f)
83
+
84
+ if skip_cats:
85
+ # skip categories
86
+ resx = [clean_entry(entry, leave_out)
87
+ for cat in data for entry in cat['sentences']]
88
+ res = [e for e in resx if e]
89
+ else:
90
+ raise NotImplementedError
91
+
92
+ # resx = {cat['source']: [clean_entry(entry, leave_out) for entry in cat['sentences']] for cat in data}
93
+ # res = {k: resx[k] for k in resx if resx[k]}
94
+
95
+ return res
96
+
97
+
98
+ def get_tr_pairs(raw_data=None, filename=None, leave_out=None, leave_only=None, model_type=None, exclude_set=None):
99
+ if filename is not None:
100
+ raw_data = load_json_data(filename)
101
+
102
+ if raw_data is None:
103
+ raise ValueError("Neither file nor data are provided")
104
+
105
+ i = 0
106
+ log("Loading data")
107
+ for tup in raw_data:
108
+ for l1 in tup:
109
+ for l2 in tup:
110
+ if l1 != l2 and not "dia" in l1 and not "dia" in l2:
111
+ if leave_out is None or f"{l1}-{l2}" not in leave_out:
112
+ if leave_only is None or f"{l1}-{l2}" in leave_only:
113
+ i += 1
114
+ if not i % 1000000:
115
+ log(f"Loaded {i/1000000}M pairs")
116
+ dia_key = f"{l2}-dia"
117
+
118
+ if exclude_set is None or (tup[l1] not in exclude_set[l1] and tup[l2] not in exclude_set[l2]):
119
+ input = tup[l1]
120
+ if dia_key in tup:
121
+ input = f"<{tup[dia_key]}> {input}"
122
+
123
+ conv_l1 = any_to_mdl_type(model_type, l1)
124
+ conv_l2 = any_to_mdl_type(model_type, l2)
125
+
126
+ if not snt_is_fishy(input, conv_l1) and not snt_is_fishy(tup[l2], conv_l2):
127
+ yield TrPair(conv_l1, conv_l2, input, tup[l2])
128
+
129
+ def split_by_lang(filename, model_type):
130
+ result = defaultdict(list)
131
+
132
+ # if filename is not None:
133
+ # tr_pairs = load_json_datax(filename)
134
+
135
+ tr_pairs = get_tr_pairs(filename=filename, model_type=model_type)
136
+
137
+ for tup in tr_pairs:
138
+ #for l1 in tup:
139
+ # for l2 in tup:
140
+ # if l1 != l2 and not "dia" in l1 and not "dia" in l2:
141
+ l1 = tup.src_lang
142
+ l2 = tup.tgt_lang
143
+ lp = f"{l1}-{l2}"
144
+ result[lp].append((tup.input, tup.output))
145
+
146
+ return result
147
+
148
+
149
+ def data_iter_for_tok_train(raw_data, langs_to_include):
150
+ for tup in raw_data:
151
+ for lang in tup:
152
+ if lang in langs_to_include:
153
+ yield tup[lang]
154
+
155
+
156
+ def lang_bin_mapping(coupling_specs):
157
+ lang_to_idx = dict()
158
+
159
+ for i, spec_pair in enumerate(coupling_specs):
160
+ for lang in spec_pair.lang_set:
161
+ if lang not in lang_to_idx:
162
+ lang_to_idx[lang] = {i}
163
+ else:
164
+ lang_to_idx[lang].add(i)
165
+
166
+ return lang_to_idx
167
+
168
+
169
+ def mix_and_sample_idxs_carefully(src_idxs, tgt_idxs):
170
+ idx_pairs = [(s, t) for s in src_idxs for t in tgt_idxs if not (s == 1 and t == 1)]
171
+
172
+ if len(idx_pairs) == 0:
173
+ result = (None, None)
174
+ else:
175
+ pair_idx = randrange(len(idx_pairs))
176
+ result = idx_pairs[pair_idx]
177
+
178
+ # debug(f"src lang: {tr_pair.src_lang}, tgt_lang: {tr_pair.tgt_lang}, idx list: {idx_pairs}, result: {result}")
179
+
180
+ return result
181
+
182
+
183
+ def inject_bin_indices(batch, src_k, tgt_k):
184
+ batch['input_ids'][0,0] += src_k << 30
185
+
186
+ batch['labels'][0,0] += tgt_k << 30
187
+
188
+ def get_data_cache_location(cache_meta_path, idx):
189
+ cache_folder, cache_file = os.path.split(cache_meta_path)
190
+
191
+ if cache_folder:
192
+ Path(cache_folder).mkdir(parents=True, exist_ok=True)
193
+
194
+ if cache_meta_path.endswith(".json"):
195
+ return cache_meta_path[:-5] + f"_{idx:04}.pt"
196
+ else:
197
+ raise ValueError(f"Expected a json file for the cache meta-location ({cache_meta_path})")
198
+
199
+
200
+ def make_gen_text(src_lang, tgt_lang, input_text, output_text=None, tok=None):
201
+ if input_text.startswith("<"):
202
+ posit = input_text.find(">") + 1
203
+ dialect = input_text[1:posit-1]
204
+ diatxt = f", variety: {dialect}"
205
+ txt = input_text[posit+1:]
206
+ else:
207
+ dialect = None
208
+ diatxt = ""
209
+ txt = input_text
210
+
211
+ return (f"Translate:\n== From: {src_lang}\n== To: {tgt_lang}{diatxt}\n== Input: {txt}\n== Output: " +
212
+ ("" if (output_text is None or tok is None) else f"{output_text}{tok.eos_token}"))
213
+
214
+
215
+ class MultilingualBatchingCachingDataset:
216
+ def _post_proc_bins(self, bins):
217
+ for src_k in bins:
218
+ for tgt_k in bins[src_k]:
219
+ while len(bins[src_k][tgt_k]) % self.args.batch_size != 0:
220
+ rnd_elem_idx = randrange(len(bins[src_k][tgt_k]))
221
+ rnd_elem = bins[src_k][tgt_k][rnd_elem_idx]
222
+ bins[src_k][tgt_k].append(rnd_elem)
223
+
224
+ if self.args.sort_by_len:
225
+ bins[src_k][tgt_k] = sorted(bins[src_k][tgt_k], key=lambda e: len(e.input))
226
+ else:
227
+ shuffle(bins[src_k][tgt_k])
228
+ return bins
229
+
230
+ def _get_idxs(self, tr_pair):
231
+ src_idxs = self._lang_to_idx[tr_pair.src_lang]
232
+ tgt_idxs = self._lang_to_idx[tr_pair.tgt_lang]
233
+
234
+ return mix_and_sample_idxs_carefully(src_idxs, tgt_idxs)
235
+
236
+ def _fill_bins(self):
237
+ bins = defaultdict(lambda: defaultdict(list))
238
+
239
+ for tr_pair in get_tr_pairs(filename=self.filename, model_type=self.model_type, exclude_set=self.exclude_set):
240
+ src_bin_idx, tgt_bin_idx = self._get_idxs(tr_pair)
241
+
242
+ if src_bin_idx is not None and tgt_bin_idx is not None:
243
+ bins[src_bin_idx][tgt_bin_idx].append(tr_pair)
244
+
245
+ return self._post_proc_bins(bins)
246
+
247
+ def report_update_stats(self, bins):
248
+ total = 0
249
+ totalx = 0
250
+ updates = 0
251
+ duds = 0
252
+
253
+ enc_count = 0
254
+ dec_count = 0
255
+
256
+ for src_k in bins:
257
+ for tgt_k in bins[src_k]:
258
+ l = len(bins[src_k][tgt_k])
259
+
260
+ total += l
261
+ if src_k == 0 or tgt_k == 0:
262
+ totalx += l
263
+ updates += l * (1 - (src_k + tgt_k) / 2)
264
+
265
+ enc_count += l * (1 - src_k)
266
+ dec_count += l * (1 - tgt_k)
267
+
268
+ if src_k == 1 and tgt_k == 1:
269
+ duds += 1
270
+ # log(str(self._lang_to_idx))
271
+
272
+ log(f"### Ratio of coupled model updates: {100 * updates / total:.2f}% ({100 * updates / totalx:.2f}%); " + \
273
+ f"frozen meaningless updates: {100 * duds / total:.2f}%; " + \
274
+ f"enc samples: {enc_count}, dec samples: {dec_count}")
275
+
276
+ def tokenize_input(self, cplspec, input_list, rawbatch):
277
+ src_tokenizer = cplspec.tokenizer
278
+ src_tokenizer.src_lang = rawbatch[0].src_lang
279
+ #prep_batch_grouped = src_tokenizer(text=input_list, return_tensors="pt",
280
+ # padding="longest", truncation=True, max_length=self.args.max_snt_len)
281
+ prep_batch_grouped = tokenizeit((src_tokenizer, cplspec.postokenizer), input_list, self.args.max_snt_len, False)
282
+
283
+ if is_nllb(src_tokenizer):
284
+ src_lang_list = [any_to_nllb(e.src_lang) for e in rawbatch]
285
+ src_lang_vec = src_tokenizer.convert_tokens_to_ids(src_lang_list)
286
+ prep_batch_grouped['input_ids'][:,0] = torch.tensor(src_lang_vec)
287
+
288
+ return prep_batch_grouped
289
+
290
+ def tokenize_output(self, tgttokenizer, tgtposttok, rawbatch):
291
+ outputs = [e.output for e in rawbatch]
292
+ tgttokenizer.tgt_lang = rawbatch[0].tgt_lang
293
+ #labels = tgttokenizer(text_target=outputs, return_tensors="pt",
294
+ # padding="longest", truncation=True, max_length=self.args.max_snt_len)
295
+ labels = tokenizeit((tgttokenizer, tgtposttok), outputs, self.args.max_snt_len, True)
296
+
297
+ if is_nllb(tgttokenizer):
298
+ tgt_lang_list = [any_to_nllb(e.tgt_lang) for e in rawbatch]
299
+ tgt_lang_vec = tgttokenizer.convert_tokens_to_ids(tgt_lang_list)
300
+ labels['input_ids'][:, 0] = torch.tensor(tgt_lang_vec)
301
+
302
+ return labels
303
+
304
+ def tokenize_gen_batch(self, raw_batch):
305
+ tokenizer = self.coupling_specs[0].tokenizer
306
+ tokenizer.pad_token = '<|reserved_special_token_0|>'
307
+ tokenizer.padding_side = 'left'
308
+
309
+ texts = [make_gen_text(e.src_lang, e.tgt_lang, e.input, e.output, tokenizer) for e in raw_batch]
310
+
311
+ #batch = tokenizer(texts, return_tensors="pt", max_length=512, truncation=True, add_special_tokens=True, padding=True)
312
+ batch = tokenizeit((tokenizer, self.coupling_specs[0].postokenizer), texts, self.args.max_snt_len, False)
313
+
314
+ return batch
315
+
316
+ def tokenize_and_pad(self, raw_batch, src_k, tgt_k):
317
+ tgt_tokenizer = self.coupling_specs[tgt_k].tokenizer
318
+ tgt_postok = self.coupling_specs[tgt_k].postokenizer
319
+
320
+ if is_madlad(tgt_tokenizer):
321
+ inputs = [f"{any_to_madlad(e.tgt_lang)} {e.input}" for e in raw_batch]
322
+ else:
323
+ inputs = [e.input for e in raw_batch]
324
+
325
+ prep_batch_grouped = self.tokenize_input(self.coupling_specs[src_k], inputs, raw_batch)
326
+ labels = self.tokenize_output(tgt_tokenizer, tgt_postok, raw_batch)
327
+ prep_batch_grouped['labels'] = labels['input_ids']
328
+
329
+ # inject_bin_indices(prep_batch_grouped, src_k, tgt_k)
330
+
331
+ #split_prep_batch = [{k: prep_batch_grouped[k][i] for k in prep_batch_grouped}
332
+ # for i, trp in enumerate(raw_batch)]
333
+
334
+ return prep_batch_grouped
335
+
336
+ def _bins_to_tokenized_batched_cached_data(self, bins, cache_path):
337
+ shard_i = 0
338
+ batch_i = 0
339
+ total_i = 0
340
+
341
+ metainfo = []
342
+ data = []
343
+
344
+ log("Tokenizing data")
345
+
346
+ for raw_batch, src_k, tgt_k in do_bins_in_batches(bins, self.args.batch_size, self.args.sort_by_len):
347
+ batch_i += 1
348
+ if not batch_i % 10000:
349
+ log(f"Tokenized {batch_i + shard_i * self.args.shard_size} batches (shard {shard_i})")
350
+
351
+ if is_dec_only_llm(self.coupling_specs[tgt_k].tokenizer):
352
+ prepared_batch = self.tokenize_gen_batch(raw_batch)
353
+ data.append((prepared_batch, total_i))
354
+ else:
355
+ prepared_batch = self.tokenize_and_pad(raw_batch, src_k, tgt_k)
356
+ data.append((prepared_batch, src_k, tgt_k, total_i))
357
+
358
+ if batch_i >= self.args.shard_size:
359
+ shard_i += 1
360
+ batch_i = 0
361
+ fn = self._save_cache_file(data, cache_path, shard_i)
362
+ metainfo.append({'shard_filename': fn, 'shard_size': len(data)})
363
+
364
+ del data
365
+
366
+ data = []
367
+
368
+ total_i += 1
369
+
370
+ if len(data) > 0:
371
+ fn = self._save_cache_file(data, cache_path, shard_i + 1)
372
+ metainfo.append({'shard_filename': fn, 'shard_size': len(data)})
373
+
374
+ with open(cache_path, 'w') as f:
375
+ json.dump(metainfo, f)
376
+
377
+ del data
378
+
379
+ @staticmethod
380
+ def _save_cache_file(data, cache_location, idx):
381
+ cache_location = get_data_cache_location(cache_location, idx)
382
+
383
+ if os.path.exists(cache_location):
384
+ raise Exception("Cache already exists")
385
+
386
+ torch.save(data, cache_location)
387
+ log(f"Saved data into cache (shard {idx})")
388
+
389
+ return cache_location
390
+
391
+ def set_model_type(self):
392
+ result = None
393
+
394
+ for spec_tuple in self.coupling_specs:
395
+ this_type = get_mdl_type(spec_tuple.tokenizer)
396
+ if result is None:
397
+ result = this_type
398
+ else:
399
+ assert result == this_type, "in this implementation model types (NLLB/MADLAD/...) must be the same for all included models"
400
+
401
+ return result
402
+
403
+
404
+ def __init__(self, tr_file, coupling_specs, args):
405
+ self.args = args
406
+ self.filename = tr_file
407
+ self.coupling_specs = coupling_specs
408
+
409
+ self.exclude_set = _dev_to_dict(args.exclude_set) if args.exclude_set is not None else None
410
+
411
+ self.model_type = self.set_model_type()
412
+
413
+ # init lang to idx
414
+ self._lang_to_idx = lang_bin_mapping(coupling_specs)
415
+
416
+ def load_and_cache_data(self, cache_path):
417
+ # collect data into bins and cache it
418
+ bins = self._fill_bins()
419
+
420
+ self.report_update_stats(bins)
421
+
422
+ self._bins_to_tokenized_batched_cached_data(bins, cache_path)
423
+ """
424
+
425
+ """
426
+ class DataState:
427
+ def __init__(self, elem_idx = 0, shard_idx = 0, epoch_idx = None):
428
+ self.elem_idx = elem_idx
429
+ self.shard_idx = shard_idx
430
+ self.epoch_idx = epoch_idx
431
+
432
+ def state_dict(self):
433
+ return {'elem_idx': self.elem_idx, 'shard_idx': self.shard_idx, 'epoch_idx': self.epoch_idx}
434
+
435
+ def load_state_dict(self, state_dict):
436
+ self.elem_idx = state_dict['elem_idx']
437
+ self.shard_idx = state_dict['shard_idx']
438
+ self.epoch_idx = state_dict['epoch_idx']
439
+
440
+ def copy_from(self, src_ds, epoch_idx = None):
441
+ self.shard_idx = src_ds.shard_idx
442
+ self.elem_idx = src_ds.elem_idx
443
+
444
+ if epoch_idx is not None:
445
+ self.epoch_idx = epoch_idx
446
+
447
+ def __str__(self):
448
+ return 'DataState(elem_idx={}, shard_idx={}, epoch_idx={})'.format(self.elem_idx, self.shard_idx, self.epoch_idx)
449
+
450
+ def __repr__(self):
451
+ return self.__str__()
452
+
453
+
454
+ class BatchingIterator(IterableDataset):
455
+ def __init__(self, segment_list, batch_size, tokenizer, max_len=8000):
456
+ self.data = segment_list
457
+ shuffle(self.data)
458
+
459
+ self.batch_size = batch_size
460
+ self.tokenizer = tokenizer
461
+ self.max_len = max_len
462
+
463
+ self.curr_elem_idx = 0
464
+
465
+ self.data_len = math.ceil(len(self.data) / self.batch_size)
466
+
467
+ def __len__(self):
468
+ return self.data_len
469
+
470
+ def __iter__(self):
471
+ self.curr_elem_idx = 0
472
+ return self
473
+
474
+ def where_are_we(self):
475
+ return DataState(shard_idx=0, elem_idx=self.curr_elem_idx)
476
+
477
+ def thats_where(self, data_state):
478
+ self.curr_elem_idx = data_state.elem_idx
479
+
480
+ def _get_properly_sized_segment_list(self):
481
+ i = self.curr_elem_idx * self.batch_size
482
+
483
+ segment_list = self.data[i:i + self.batch_size]
484
+ if len(segment_list) < self.batch_size:
485
+ orig_len = len(segment_list)
486
+ while len(segment_list) < self.batch_size:
487
+ segment_list.append(segment_list[randint(0, orig_len - 1)])
488
+
489
+ return segment_list
490
+
491
+ def _tokenize(self, segment_list):
492
+ #{'task': 'translate',
493
+ # 'src_segm': src_segm,
494
+ # 'tgt_segm': tgt_segm,
495
+ # 'src_lang': src_lang,
496
+ # 'tgt_lang': tgt_lang}
497
+
498
+ prepped_segm_list = [prep_llm_input(s) for s in segment_list]
499
+
500
+ self.tokenizer.pad_token = '<|reserved_special_token_0|>'
501
+ tokenized_batch = self.tokenizer(prepped_segm_list, return_tensors="pt", max_length=self.max_len,
502
+ truncation=True, add_special_tokens=True,
503
+ padding=True)
504
+ return tokenized_batch, self.curr_elem_idx + 1
505
+
506
+ def __next__(self):
507
+ if self.curr_elem_idx >= self.data_len:
508
+ raise StopIteration
509
+ else:
510
+ segment_list = self._get_properly_sized_segment_list()
511
+
512
+ batch = self._tokenize(segment_list)
513
+ self.curr_elem_idx += 1
514
+ return batch
515
+ """
516
+ """
517
+ class MultilingualDatasetIterator(IterableDataset):
518
+ def _load_metafile(self, cache_metafile):
519
+ with open(cache_metafile, 'r') as f:
520
+ self.metainfo = json.load(f)
521
+ self.data_len = sum([e['shard_size'] for e in self.metainfo])
522
+
523
+ def _init_curr_shard(self):
524
+ cache_location = self.metainfo[self.curr_shard_idx]['shard_filename']
525
+
526
+ self.curr_shard_data = torch.load(cache_location, weights_only=False)
527
+
528
+ assert len(self.curr_shard_data) == self.metainfo[self.curr_shard_idx]['shard_size']
529
+
530
+ def __init__(self, filename):
531
+ self.curr_shard_idx = 0
532
+ self.curr_elem_idx = 0
533
+ self.prev_shard_sum_len = 0
534
+
535
+ if filename is not None:
536
+ self._load_metafile(filename)
537
+
538
+ def __iter__(self):
539
+ self._init_curr_shard()
540
+ return self
541
+
542
+ def where_are_we(self):
543
+ return DataState(shard_idx=self.curr_shard_idx, elem_idx=self.curr_elem_idx)
544
+
545
+ def thats_where(self, data_state):
546
+ self.curr_shard_idx = data_state.shard_idx
547
+ self.curr_elem_idx = data_state.elem_idx
548
+ self.prev_shard_sum_len = sum([e['shard_size'] for i, e in enumerate(self.metainfo) if i < self.curr_shard_idx])
549
+
550
+ def __next__(self):
551
+ try:
552
+ result_data = self.curr_shard_data[self.curr_elem_idx]
553
+
554
+ self.curr_elem_idx += 1
555
+ except IndexError:
556
+ self.prev_shard_sum_len += self.metainfo[self.curr_shard_idx]['shard_size']
557
+ self.curr_shard_idx += 1
558
+
559
+ if self.curr_shard_idx >= len(self.metainfo):
560
+ self.__init__(None)
561
+ raise StopIteration
562
+ else:
563
+ self._init_curr_shard()
564
+ self.curr_elem_idx = 0
565
+
566
+ result_data = self.curr_shard_data[self.curr_elem_idx]
567
+
568
+ self.curr_elem_idx += 1
569
+
570
+ index_in_epoch = self.prev_shard_sum_len + self.curr_elem_idx
571
+ return result_data, index_in_epoch
572
+
573
+ def __len__(self):
574
+ return self.data_len
575
+
576
+
577
+
578
+
579
+ def dump_to_stdout():
580
+ filename = sys.argv[1]
581
+
582
+ lc_src = defaultdict(int)
583
+
584
+ tot_len = 0
585
+ tot_count = 0
586
+
587
+ for tr_pair in get_tr_pairs(filename=filename):
588
+ print(tr_pair.src_lang + "\t" + tr_pair.input + "\t" + tr_pair.tgt_lang + "\t" + tr_pair.output)
589
+
590
+ tot_len += upd_lc(lc_src, tr_pair.src_lang, tr_pair.input)
591
+ tot_len += upd_lc(lc_src, tr_pair.tgt_lang, tr_pair.output)
592
+
593
+ tot_count += 2
594
+
595
+ totes = sum(lc_src.values())
596
+ for k in sorted(lc_src):
597
+ sys.stderr.write(f"{k}: {100*lc_src[k]/totes:.1f}%\n")
598
+ sys.stderr.write(f"Avg length: {tot_len/float(tot_count):.1f}\n")
599
+
600
+
601
+ def do_stats(filename):
602
+ stats = defaultdict(int)
603
+ raw_data = load_json_data(filename)
604
+
605
+ for data in raw_data:
606
+ langs = sorted([k for k in data.keys() if data[k].strip() != ""])
607
+ stats["-".join(langs)] += 1
608
+ for k in stats:
609
+ print(k, stats[k])
610
+
611
+
612
+ def lang_from_name(filename):
613
+ return filename.split(".")[-1]
614
+
615
+
616
+ def moses_to_json(file1, file2):
617
+ result = list()
618
+
619
+ l1 = lang_from_name(file1)
620
+ l2 = lang_from_name(file2)
621
+
622
+ with open(file1, "r") as h1, open(file2, "r") as h2:
623
+ for line1 in h1:
624
+ line2 = h2.readline()
625
+
626
+ result.append({l1: line1.strip(), l2: line2.strip()})
627
+
628
+ return result
629
+
630
+
631
+ def multi_moses_to_json(output_file, init_json, input_file_tuples):
632
+ try:
633
+ with open(init_json, "r") as h:
634
+ result = json.load(h)
635
+ except:
636
+ result = list()
637
+
638
+ for input_file_tuple in input_file_tuples:
639
+ this_result = moses_to_json(*input_file_tuple)
640
+ result.append({"source": f"{input_file_tuple[0]}-{input_file_tuple[1]}", "sentences": this_result})
641
+
642
+ with open(output_file, "w") as f:
643
+ json.dump(result, f, indent=2, sort_keys=True)
644
+
645
+
646
+ def group_tuples(input_tuples):
647
+ return [(input_tuples[2 * i], input_tuples[2 * i + 1]) for i in range(int(len(input_tuples) / 2))]
648
+
649
+
650
+ def combine_two_jsons(json_target, json_addition):
651
+ for k in json_addition:
652
+ if k in json_target:
653
+ json_target[k] += json_addition[k]
654
+ else:
655
+ json_target[k] = json_addition[k]
656
+
657
+
658
+ def combine_jsons(filelist):
659
+ result = dict()
660
+
661
+ for filename in filelist:
662
+ data = json.load(open(filename))
663
+
664
+ combine_two_jsons(result, data)
665
+
666
+ json.dumps(result)
667
+
668
+
669
+ def _dev_to_dict(filename):
670
+ result = defaultdict(lambda: defaultdict(int))
671
+
672
+ for dev_sample in load_json_data(filename):
673
+ for lang in dev_sample:
674
+ if not "dia" in lang:
675
+ result[lang][dev_sample[lang]] = 1
676
+
677
+ return result
678
+
679
+
680
+ def check_cross_pollination(small_path, large_path):
681
+ print("preparing dev set")
682
+ dct = _dev_to_dict(small_path)
683
+
684
+ print("reading train set")
685
+ for train_sample in load_json_data(large_path):
686
+ for lang in train_sample:
687
+ if not "dia" in lang and lang in dct:
688
+ snt = train_sample[lang]
689
+
690
+ if snt in dct[lang]:
691
+ dct[lang][snt] += 1
692
+
693
+ print("---------------------")
694
+ print("contamination report:")
695
+ print("---------------------")
696
+ for lang in dct:
697
+ total = 0
698
+ counts = 0
699
+ freqs = 0
700
+
701
+ for snt in dct[lang]:
702
+ total += 1
703
+ if dct[lang][snt] > 1:
704
+ counts += 1
705
+ freqs += (dct[lang][snt] - 1)
706
+
707
+ print(f"{lang}: contaminated: {counts} ({100*counts/float(total):.1f}%), total occurrence: {freqs}")
708
+
709
+
710
+ def char_class(c):
711
+ lc = c.lower()
712
+ if re.match("[a-z]", lc):
713
+ return "latn"
714
+ elif re.match("[а-я]", lc):
715
+ return "cyrl"
716
+ else:
717
+ return "other"
718
+
719
+
720
+ def snt_is_fishy(snt_raw, lang, detailed=False):
721
+ snt = re.sub(r'^<[^>]+> ', '', snt_raw)
722
+
723
+ snt_db = defaultdict(int)
724
+ for c in snt:
725
+ c_c = char_class(c)
726
+ snt_db[c_c] += 1
727
+
728
+ tot = snt_db['latn'] + snt_db['cyrl']
729
+
730
+ if tot > 0:
731
+ if snt_db['latn'] / tot > 0.7:
732
+ this_is = 'latn'
733
+ elif snt_db['cyrl'] / tot > 0.7:
734
+ this_is = 'cyrl'
735
+ else:
736
+ this_is = 'mix'
737
+
738
+ should_be = any_to_nllb(lang).split("_")[1].lower()
739
+
740
+ if should_be != this_is:
741
+ return (True, this_is, should_be) if detailed else True
742
+
743
+ return (False, None, None) if detailed else False
744
+
745
+
746
+ def script_stats():
747
+ db = defaultdict(lambda: defaultdict(int))
748
+
749
+ # corp = []
750
+
751
+ for raw_line in sys.stdin:
752
+ lang, snt_raw = raw_line.strip().split("\t")
753
+
754
+ is_fishy, this_is, should_be = snt_is_fishy(snt_raw, lang, detailed=True)
755
+ if is_fishy:
756
+ print(f"{lang}: should be {should_be}, is actually {this_is}:\n{snt_raw}")
757
+
758
+
759
+
760
+ def get_full_lang(lang, tupl):
761
+ dia_key = f"{lang}-dia"
762
+
763
+ if dia_key in tupl:
764
+ return f"{lang}, {tupl[dia_key]}"
765
+ else:
766
+ return lang
767
+
768
+
769
+ def convert_json_to_json(src_json, dest_json):
770
+ raw_data = load_json_data(src_json)
771
+
772
+ output_data = []
773
+
774
+ for tupl in raw_data:
775
+ for l1 in tupl:
776
+ for l2 in tupl:
777
+ if l1 != l2 and not "dia" in l1 and not "dia" in l2:
778
+ src_segm = tupl[l1]
779
+ tgt_segm = tupl[l2]
780
+
781
+ src_lang = get_full_lang(l1, tupl)
782
+ tgt_lang = get_full_lang(l2, tupl)
783
+
784
+ output_data.append({ 'task': 'translate',
785
+ 'src_segm': src_segm,
786
+ 'tgt_segm': tgt_segm,
787
+ 'src_lang': src_lang,
788
+ 'tgt_lang': tgt_lang})
789
+
790
+ with open(dest_json, "w") as f:
791
+ json.dump(output_data, f, indent=2)
792
+ """
793
+
794
+ if __name__ == "__main__":
795
+ # check_cross_pollination(sys.argv[1], sys.argv[2])
796
+ # multi_moses_to_json(sys.argv[1], sys.argv[2], group_tuples(sys.argv[3:]))
797
+ # combine_jsons(sys.argv[1:])
798
+ # do_stats("data/train.json")
799
+
800
+ # dump_to_stdout()
801
+ # script_stats()
802
+
803
+ # convert_json_to_json(sys.argv[1], sys.argv[2])
804
+ pass
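
The bulk of this backup file is kept inside string literals, so only the imports and the empty __main__ guard are live. For reference, the commented-out MultilingualDatasetIterator reads a JSON meta-file that lists torch-saved shards; a sketch of that layout, with hypothetical file names:

    import json, torch

    # each shard file is a torch.save()-d list of pre-tokenized batches
    metainfo = [
        {"shard_filename": "cache_0001.pt", "shard_size": 10000},
        {"shard_filename": "cache_0002.pt", "shard_size": 7321},
    ]
    with open("cache.json", "w") as f:
        json.dump(metainfo, f)

    # the iterator then calls torch.load(shard_filename, weights_only=False)
    # and checks that len(shard) == shard_size before serving batches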
kuidastaltsutadalaamat/legacy/diffmdl.py ADDED
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import sys
4
+ import torch
5
+
6
+ from datetime import datetime
7
+ from transformers import AutoModelForSeq2SeqLM
8
+
9
+
10
+ def get_mdl_param_dict(mdl):
11
+ return {n: p for n, p in mdl.named_parameters()}
12
+
13
+
14
+ def log(msg):
15
+ sys.stderr.write(str(datetime.now()) + ": " + msg + '\n')
16
+
17
+
18
+ def _avg_diff(pd1, pd2, skip_emb):
19
+ result = 0
20
+ count = 0
21
+
22
+ raw_count = 0
23
+
24
+ for k in pd1.keys():
25
+ # log(k)
26
+ if not (skip_emb and "shared" in k):
27
+ delta = pd1[k] - pd2[k]
28
+
29
+ raw_count += 1
30
+
31
+ if len(delta.shape) == 1:
32
+ thiscount = delta.shape[0]
33
+ elif len(delta.shape) == 2:
34
+ thiscount = delta.shape[0] * delta.shape[1]
35
+ else:
36
+ raise Exception("Unexpected shape")
37
+ count += thiscount
38
+ deltasum = torch.sum(delta)
39
+ #log(f"DETDIFF {k}: {deltasum/thiscount}")
40
+ result += deltasum
41
+ # print(f"Count {count}, raw count {raw_count}")
42
+
43
+ return result / count
44
+
45
+
46
+ def avg_mdl_diff(m1, m2, skip_emb=False):
47
+ pd1 = get_mdl_param_dict(m1)
48
+ pd2 = get_mdl_param_dict(m2)
49
+
50
+ assert (pd1.keys() == pd2.keys())
51
+
52
+ return _avg_diff(pd1, pd2, skip_emb)
53
+
54
+
55
+ if __name__ == "__main__":
56
+ mdl1_id = sys.argv[1]
57
+ mdl2_id = sys.argv[2]
58
+
59
+ log(f"Load mdl 1: {mdl1_id}")
60
+ model1 = AutoModelForSeq2SeqLM.from_pretrained(mdl1_id)
61
+
62
+ log(f"Load mdl 2: {mdl2_id}")
63
+ model2 = AutoModelForSeq2SeqLM.from_pretrained(mdl2_id)
64
+
65
+ log(f"Full diff: {avg_mdl_diff(model1, model2)}")
66
+
67
+ #log(f"Encoder diff: {avg_mdl_diff(model1.get_encoder(), model2.get_encoder(), True)}")
68
+ #log(f"Decoder diff: {avg_mdl_diff(model1.get_decoder(), model2.get_decoder(), True)}")
69
+ #log(f"Embedding diff: {avg_mdl_diff(model1.get_input_embeddings(), model2.get_input_embeddings())}")
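
Usage sketch (model ids are placeholders): the script loads two seq2seq checkpoints and prints the mean of the signed parameter deltas, so a value near zero can still hide large changes of opposite sign, since deltas are summed rather than taken in absolute value.

    python diffmdl.py facebook/nllb-200-distilled-600M models/my-finetuned-nllb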
kuidastaltsutadalaamat/legacy/initmodel.py ADDED
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import os
4
+
5
+ from transformers import AutoConfig, AutoModelForSeq2SeqLM
6
+
7
+ from legacy.modelops import mdl_param_count
8
+ from legacy.tokops import train_or_extend_tokenizer_and_upd_model, save_postokens
9
+
10
+ from aux import get_changed_config, CmdlineArgs
11
+ from legacy.langconv import lang_set_maybe_smugri
12
+
13
+
14
+ def just_do_main_stuff_and_avoid_global_ctx_variables():
15
+ args = CmdlineArgs("Initialize a new HuggingFace model randomly, off of an existing configuration, with possible changes",
16
+ pos_arg_list=["mdl_id", "save_location"],
17
+ kw_arg_dict={ k: None for k in ["tok_train_file", "new_langs", "vocab_size", "merge_tokenizers", "merge_tok_mdl_id",
18
+ "tok_mdl_id", "activation_dropout", "activation_function", "d_model",
19
+ "decoder_attention_heads", "decoder_ffn_dim", "decoder_layerdrop", "decoder_layers",
20
+ "encoder_attention_heads", "encoder_ffn_dim", "encoder_layerdrop", "encoder_layers",
21
+ "num_hidden_layers"] })
22
+ if not args.tok_mdl_id:
23
+ args.tok_mdl_id = args.mdl_id
24
+
25
+ if args.new_langs:
26
+ args.new_langs = lang_set_maybe_smugri(args.new_langs)
27
+
28
+ if os.path.exists(args.save_location):
29
+ raise Exception(f"Save location '{args.save_location}' already exists, don't want to overwrite")
30
+
31
+ config = get_changed_config(AutoConfig.from_pretrained(args.mdl_id), args)
32
+
33
+ model = AutoModelForSeq2SeqLM.from_config(config)
34
+
35
+ tokenizer, added = train_or_extend_tokenizer_and_upd_model(args, model)
36
+
37
+ tokenizer.save_pretrained(args.save_location)
38
+ save_postokens(added, args.save_location)
39
+ model.save_pretrained(args.save_location)
40
+
41
+ mdl_size, emb_size = mdl_param_count(model)
42
+ print(f"Created model with {mdl_size} parameters" +
43
+ ("" if emb_size < 0 else f" of which {emb_size} ({100 * emb_size / mdl_size:.2f}%) are embeddings"))
44
+
45
+ if __name__ == '__main__':
46
+ just_do_main_stuff_and_avoid_global_ctx_variables()
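
A hypothetical invocation, assuming CmdlineArgs takes positional arguments followed by key=value pairs as in the commented-out examples in oldtrainllm.py further below; the model id, paths and values are placeholders:

    python initmodel.py facebook/nllb-200-distilled-600M models/scratch-nllb \
        tok_train_file=data/mono.txt new_langs=smugri vocab_size=32000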
kuidastaltsutadalaamat/legacy/langconv.py ADDED
@@ -0,0 +1,260 @@
1
+ """
2
+ Convert lang. codes between different schemas
3
+
4
+ NLLB uses codes like "eng_Latn": ISO-639-3 and script
5
+
6
+ MADLAD uses codes like "<2en>": ISO-639-1 for where it's available, ISO-639-3 elsewhere
7
+ (and some codes include the script, but we'll ignore them here)
8
+
9
+ Functions at the end of the file (any_to_nllb, any_to_madlad) should
10
+ cope with a lang code in any style ('en', 'eng', 'eng_Latn', '<2en>', '<2eng>', etc)
11
+ and convert them to corresponding representations (NLLB/MADLAD).
12
+ """
13
+ from collections import defaultdict
14
+
15
+ SMUGRI_LOW = "fkv,izh,kca,koi,kpv,krl,liv,lud,mdf,mhr,mns,mrj,myv,olo,sjd,sje,sju,sma,sme,smj,smn,sms,udm,vep,vot,vro"
16
+ SMUGRI_HIGH = "deu,eng,est,fin,hun,lvs,nor,rus,swe"
17
+ SMUGRI = SMUGRI_HIGH + "," + SMUGRI_LOW
18
+
19
+ import pycountry
20
+
21
+ # madlad all codes
22
+ MADLAD_CODES = ['<2meo>', '<2lo>', '<2Grek>', '<2ada>', '<2ps>', '<2arn>', '<2Armn>', '<2to>', '<2raj>', '<2bas>', '<2ny>', '<2>', '<2zza>', '<2Thai>', '<2kaa_Latn>', '<2yap>', '<2en_xx_simple>', '<2ta>', '<2bg_Latn>', '<2mkn>', '<2lhu>', '<2gu_Latn>', '<2nzi>', '<2uz>', '<2pis>', '<2cfm>', '<2min>', '<2fon>', '<2tn>', '<2msi>', '<2sw>', '<2Tfng>', '<2teo>', '<2taj>', '<2pap>', '<2sd>', '<2Jpan>', '<2tca>', '<2sr>', '<2an>', '<2fr>', '<2gor>', '<2az>', '<2qvi>', '<2pck>', '<2cak>', '<2ltg>', '<2sah>', '<2tly_IR>', '<2ts>', '<2yo>', '<2hne>', '<2bzj>', '<2tuc>', '<2sh>', '<2da>', '<2gui>', '<2translate>', '<2et>', '<2sja>', '<2nhe>', '<2scn>', '<2dje>', '<2pt>', '<2nog>', '<2fil>', '<2mai>', '<2lb>', '<2bm>', '<2Guru>', '<2gom>', '<2hr>', '<2kg>', '<2uk>', '<2rw>', '<2izz>', '<2Telu>', '<2wuu>', '<2Deva>', '<2or>', '<2is>', '<2om>', '<2iso>', '<2sn>', '<2kjh>', '<2tbz>', '<2suz>', '<2bjn>', '<2lv>', '<2mfe>', '<2tcy>', '<2tyz>', '<2ksw>', '<2nds_NL>', '<2ms>', '<2mam>', '<2ubu>', '<2hil>', '<2mh>', '<2gl>', '<2bew>', '<2ilo>', '<2kbd>', '<2toj>', '<2quf>', '<2jam>', '<2Beng>', '<2tyv>', '<2lmo>', '<2ace>', '<2cab>', '<2sq>', '<2ug>', '<2kac>', '<2ay>', '<2mag>', '<2Arab>', '<2mrj>', '<2cs>', '<2bci>', '<2doi>', '<2zu>', '<2ndc_ZW>', '<2smt>', '<2ho>', '<2ss>', '<2he>', '<2twu>', '<2kjg>', '<2pag>', '<2Latn>', '<2gym>', '<2sus>', '<2zh_Latn>', '<2mps>', '<2lg>', '<2ko>', '<2se>', '<2guc>', '<2mr>', '<2mwl>', '<2dwr>', '<2din>', '<2ffm>', '<2maz>', '<2nia>', '<2nl>', '<2Knda>', '<2jv>', '<2noa>', '<2udm>', '<2kr>', '<2de>', '<2ar>', '<2ZW>', '<2dln>', '<2mn>', '<2ml>', '<2crh>', '<2ha>', '<2ks>', '<2qvc>', '<2fur>', '<2myv>', '<2nv>', '<2ak>', '<2Gujr>', '<2cce>', '<2nso>', '<2sg>', '<2rmc>', '<2mas>', '<2mni>', '<2frp>', '<2my>', '<2xal>', '<2th>', '<2bik>', '<2bho>', '<2inb>', '<2Mlym>', '<2oj>', '<2back_translated>', '<2tet>', '<2gsw>', '<2ff>', '<2hy>', '<2otq>', '<2el>', '<2agr>', '<2br>', '<2alt>', '<2tzo>', '<2chm>', '<2transliterate>', '<2hu>', '<2btx>', '<2vi>', '<2iba>', '<2bg>', '<2gub>', '<2li>', '<2ace_Arab>', '<2qub>', '<2ktu>', '<2bru>', '<2bbc>', '<2ca>', '<2hvn>', '<2sat_Latn>', '<2ku>', '<2shn>', '<2djk>', '<2krc>', '<2io>', '<2ig>', '<2chk>', '<2sm>', '<2Mymr>', '<2Kore>', '<2ary>', '<2lu>', '<2fa>', '<2spp>', '<2af>', '<2ti>', '<2Tibt>', '<2emp>', '<2enq>', '<2kl>', '<2be>', '<2srn>', '<2ms_Arab_BN>', '<2kri>', '<2gd>', '<2mk>', '<2syr>', '<2kmz_Latn>', '<2CA>', '<2ium>', '<2abt>', '<2ngu>', '<2tab>', '<2it>', '<2ru>', '<2ann>', '<2msm>', '<2fo>', '<2ne>', '<2akb>', '<2kv>', '<2jac>', '<2ceb>', '<2ang>', '<2tdx>', '<2tr>', '<2kbp>', '<2mgh>', '<2az_RU>', '<2acf>', '<2tg>', '<2dov>', '<2pau>', '<2mg>', '<2fuv>', '<2nn>', '<2Hant>', '<2hui>', '<2ml_Latn>', '<2ja>', '<2lus>', '<2te>', '<2qu>', '<2rom>', '<2tsg>', '<2el_Latn>', '<2cr_Latn>', '<2ur>', '<2fi>', '<2shp>', '<2brx>', '<2laj>', '<2sda>', '<2lij>', '<2st>', '<2bn>', '<2zxx_xx_dtynoise>', '<2yua>', '<2no>', '<2fr_CA>', '<2miq>', '<2trp>', '<2es>', '<2ch>', '<2mass>', '<2os>', '<2bts>', '<2ady>', '<2lrc>', '<2seh>', '<2adh>', '<2new>', '<2mak>', '<2grc>', '<2nus>', '<2tzj>', '<2nut>', '<2gu>', '<2oc>', '<2ppk>', '<2Hans>', '<2tzh>', '<2si>', '<2wo>', '<2nyu>', '<2Hebr>', '<2mad>', '<2tll>', '<2kr_Arab>', '<2pon>', '<2mbt>', '<2kw>', '<2bjn_Arab>', '<2gn>', '<2eu>', '<2dz>', '<2kaa>', '<2crh_Latn>', '<2te_Latn>', '<2ky>', '<2kn_Latn>', '<2kum>', '<2fip>', '<2ksd>', '<2sk>', '<2NL>', '<2ctd_Latn>', '<2Khmr>', '<2gbm>', '<2Cans>', '<2haw>', '<2gag>', '<2Taml>', '<2cnh>', '<2bim>', '<2ms_Arab>', '<2Thaa>', '<2kha>', 
'<2tvl>', '<2Cyrl>', '<2chr>', '<2dtp>', '<2ba>', '<2nan_Latn_TW>', '<2ro>', '<2ctu>', '<2Ethi>', '<2zh>', '<2ln>', '<2ve>', '<2xh>', '<2skr>', '<2ber>', '<2niq>', '<2ibb>', '<2jvn>', '<2tks>', '<2av>', '<2ahk>', '<2tk>', '<2tt>', '<2ka>', '<2tsc>', '<2km>', '<2co>', '<2id>', '<2prs>', '<2rki>', '<2kmb>', '<2ks_Deva>', '<2ify>', '<2wal>', '<2arz>', '<2amu>', '<2rm>', '<2pa>', '<2RU>', '<2ce>', '<2hi>', '<2eo>', '<2taq>', '<2ga>', '<2qxr>', '<2la>', '<2bi>', '<2rwo>', '<2dyu>', '<2zh_Hant>', '<2mt>', '<2bqc>', '<2bn_Latn>', '<2zne>', '<2szl>', '<2lt>', '<2sl>', '<2hif>', '<2alz>', '<2ber_Latn>', '<2ckb>', '<2wa>', '<2Cher>', '<2msb>', '<2gom_Latn>', '<2ru_Latn>', '<2crs>', '<2kk>', '<2gvl>', '<2qvz>', '<2bar>', '<2qup>', '<2bgp>', '<2bo>', '<2su>', '<2tzm>', '<2IR>', '<2sv>', '<2srm>', '<2rn>', '<2bus>', '<2jiv>', '<2awa>', '<2gv>', '<2knj>', '<2as>', '<2quc>', '<2en>', '<2sa>', '<2bug>', '<2quy>', '<2hi_Latn>', '<2nds>', '<2kek>', '<2mrw>', '<2kos>', '<2cy>', '<2ta_Latn>', '<2kn>', '<2nr>', '<2ape>', '<2bs>', '<2iu>', '<2nnb>', '<2Geor>', '<2rcf>', '<2meu>', '<2cac>', '<2cuk>', '<2bua>', '<2vec>', '<2so>', '<2fj>', '<2gof>', '<2koi>', '<2cv>', '<2guh>', '<2war>', '<2pl>', '<2cbk>', '<2kj>', '<2dv>', '<2mdf>', '<2fy>', '<2am>', '<2sc>', '<2taq_Tfng>', '<2mi>', '<2zap>', '<2mqy>', '<2yi>', '<2kwi>', '<2hmn>', '<2tiv>', '<2sxn>', '<2hus>', '<2ban>', '<2nij>', '<2tlh>', '<2Orya>', '<2quh>', '<2ee>', '<2ht>', '<2bum>', '<2stq>']
23
+
24
+ # NLLB all codes
25
+ NLLB_CODES = ['ace_Arab', 'ace_Latn', 'acm_Arab', 'acq_Arab', 'aeb_Arab', 'afr_Latn', 'ajp_Arab', 'aka_Latn', 'amh_Ethi', 'apc_Arab', 'arb_Arab', 'ars_Arab', 'ary_Arab', 'arz_Arab', 'asm_Beng', 'ast_Latn', 'awa_Deva', 'ayr_Latn', 'azb_Arab', 'azj_Latn', 'bak_Cyrl', 'bam_Latn', 'ban_Latn', 'bel_Cyrl', 'bem_Latn', 'ben_Beng', 'bho_Deva', 'bjn_Arab', 'bjn_Latn', 'bod_Tibt', 'bos_Latn', 'bug_Latn', 'bul_Cyrl', 'cat_Latn', 'ceb_Latn', 'ces_Latn', 'cjk_Latn', 'ckb_Arab', 'crh_Latn', 'cym_Latn', 'dan_Latn', 'deu_Latn', 'dik_Latn', 'dyu_Latn', 'dzo_Tibt', 'ell_Grek', 'eng_Latn', 'epo_Latn', 'est_Latn', 'eus_Latn', 'ewe_Latn', 'fao_Latn', 'pes_Arab', 'fij_Latn', 'fin_Latn', 'fon_Latn', 'fra_Latn', 'fur_Latn', 'fuv_Latn', 'gla_Latn', 'gle_Latn', 'glg_Latn', 'grn_Latn', 'guj_Gujr', 'hat_Latn', 'hau_Latn', 'heb_Hebr', 'hin_Deva', 'hne_Deva', 'hrv_Latn', 'hun_Latn', 'hye_Armn', 'ibo_Latn', 'ilo_Latn', 'ind_Latn', 'isl_Latn', 'ita_Latn', 'jav_Latn', 'jpn_Jpan', 'kab_Latn', 'kac_Latn', 'kam_Latn', 'kan_Knda', 'kas_Arab', 'kas_Deva', 'kat_Geor', 'knc_Arab', 'knc_Latn', 'kaz_Cyrl', 'kbp_Latn', 'kea_Latn', 'khm_Khmr', 'kik_Latn', 'kin_Latn', 'kir_Cyrl', 'kmb_Latn', 'kon_Latn', 'kor_Hang', 'kmr_Latn', 'lao_Laoo', 'lvs_Latn', 'lij_Latn', 'lim_Latn', 'lin_Latn', 'lit_Latn', 'lmo_Latn', 'ltg_Latn', 'ltz_Latn', 'lua_Latn', 'lug_Latn', 'luo_Latn', 'lus_Latn', 'mag_Deva', 'mai_Deva', 'mal_Mlym', 'mar_Deva', 'min_Latn', 'mkd_Cyrl', 'plt_Latn', 'mlt_Latn', 'mni_Beng', 'khk_Cyrl', 'mos_Latn', 'mri_Latn', 'zsm_Latn', 'mya_Mymr', 'nld_Latn', 'nno_Latn', 'nob_Latn', 'npi_Deva', 'nso_Latn', 'nus_Latn', 'nya_Latn', 'oci_Latn', 'gaz_Latn', 'ory_Orya', 'pag_Latn', 'pan_Guru', 'pap_Latn', 'pol_Latn', 'por_Latn', 'prs_Arab', 'pbt_Arab', 'quy_Latn', 'ron_Latn', 'run_Latn', 'rus_Cyrl', 'sag_Latn', 'san_Deva', 'sat_Beng', 'scn_Latn', 'shn_Mymr', 'sin_Sinh', 'slk_Latn', 'slv_Latn', 'smo_Latn', 'sna_Latn', 'snd_Arab', 'som_Latn', 'sot_Latn', 'spa_Latn', 'als_Latn', 'srd_Latn', 'srp_Cyrl', 'ssw_Latn', 'sun_Latn', 'swe_Latn', 'swh_Latn', 'szl_Latn', 'tam_Taml', 'tat_Cyrl', 'tel_Telu', 'tgk_Cyrl', 'tgl_Latn', 'tha_Thai', 'tir_Ethi', 'taq_Latn', 'taq_Tfng', 'tpi_Latn', 'tsn_Latn', 'tso_Latn', 'tuk_Latn', 'tum_Latn', 'tur_Latn', 'twi_Latn', 'tzm_Tfng', 'uig_Arab', 'ukr_Cyrl', 'umb_Latn', 'urd_Arab', 'uzn_Latn', 'vec_Latn', 'vie_Latn', 'war_Latn', 'wol_Latn', 'xho_Latn', 'ydd_Hebr', 'yor_Latn', 'yue_Hant', 'zho_Hans', 'zho_Hant', 'zul_Latn']
26
+
27
+ MDL_NLLB = "MDL_NLLB"
28
+ MDL_MADLAD = "MDL_MADLAD"
29
+ MDL_NEUROTOLGE = "MDL_NEUROTÕLGE"
30
+ MDL_LLAMA = "MDL_LLAMA"
31
+
32
+ _iso3_to_script = dict([nllb_code.split("_") for nllb_code in NLLB_CODES])
33
+
34
+ iso3_to_nllb = { code: f"{code}_{_iso3_to_script[code]}" for code in _iso3_to_script }
35
+
36
+ iso3_to_nllb['lav'] = "lvs_Latn"
37
+ iso3_to_nllb['nor'] = "nob_Latn"
38
+ iso3_to_nllb['yid'] = "ydd_Hebr"
39
+
40
+ for lang in "fkv izh krl liv lud olo sje sju sma sme smj smn sms vep vot vro".split():
41
+ iso3_to_nllb[lang] = f"{lang}_Latn"
42
+
43
+ for lang in "kca koi kpv mdf mhr mns mrj myv sjd udm".split():
44
+ iso3_to_nllb[lang] = f"{lang}_Cyrl"
45
+
46
+
47
+ _rev_joshi = defaultdict(lambda: "?")
48
+
49
+ for k in "krl,sma,vep,smj,smn,lud,liv,izh,vot,kca,sms,sje,mns,fkv,sju,sjd".split(","):
50
+ _rev_joshi[k] = "0"
51
+ for k in "kpv,sme,mhr,udm,olo,myv,mdf,vro,mrj,koi".split(","):
52
+ _rev_joshi[k] = "1"
53
+ for k in SMUGRI_HIGH.split(","):
54
+ _rev_joshi[k] = "2+"
55
+
56
+
57
+ def guess_script(lang):
58
+ return "Unk"
59
+
60
+
61
+ def get_high_set():
62
+ return set(SMUGRI_HIGH.split(",")) - {"deu", "swe"}
63
+
64
+
65
+ def clean_lang(raw_lang):
66
+ if "<2" in raw_lang:
67
+ raw_lang = raw_lang[2:-1]
68
+
69
+ if "_" in raw_lang:
70
+ return raw_lang.split("_")[0]
71
+ else:
72
+ return raw_lang
73
+
74
+
75
+ def any_to_base(lang):
76
+ clang = clean_lang(lang)
77
+
78
+ res = pycountry.languages.get(alpha_2=clang)
79
+
80
+ if res is None:
81
+ return pycountry.languages.get(alpha_3=clang)
82
+ else:
83
+ return res
84
+
85
+
86
+ def base_to_nllb(lang_entry=None, lang_code=None):
87
+ if lang_code is None:
88
+ lang_code = lang_entry.alpha_3
89
+
90
+ try:
91
+ #script = iso3_to_script[lang_code]
92
+ return iso3_to_nllb[lang_code]
93
+ except KeyError:
94
+ script = guess_script(lang_code)
95
+ return f"{lang_code}_{script}"
96
+
97
+
98
+ def base_to_madlad(lang_entry=None, lang_code=None):
99
+ if lang_code is None:
100
+ if hasattr(lang_entry, 'alpha_2'):
101
+ lang_code = lang_entry.alpha_2
102
+ else:
103
+ lang_code = lang_entry.alpha_3
104
+
105
+ return f"<2{lang_code}>"
106
+
107
+
108
+ def any_to_something(lang, conv_func):
109
+ base = any_to_base(lang)
110
+
111
+ if base is None:
112
+ clang = clean_lang(lang)
113
+ return conv_func(None, clang)
114
+ else:
115
+ return conv_func(base)
116
+
117
+
118
+ def run_test(src_list, tgt_list, conv_func, msg_prefix, verbose=False):
119
+ ok_count = 0
120
+ err_count = 0
121
+ fail_count = 0
122
+
123
+ for raw_c in src_list:
124
+ try:
125
+ test = conv_func(raw_c)
126
+ if test in tgt_list:
127
+ ok_count += 1
128
+ else:
129
+ fail_count += 1
130
+ if verbose:
131
+ print("FAIL:", test)
132
+ except KeyError:
133
+ err_count += 1
134
+ if verbose:
135
+ print("ERR:", raw_c)
136
+
137
+ print(f"{msg_prefix}: {ok_count} good, {fail_count} fail, {err_count} err")
138
+
139
+
140
+ def any_to_madlad(lang):
141
+ return any_to_something(lang, base_to_madlad)
142
+
143
+
144
+ def any_to_nllb(lang):
145
+ return any_to_something(lang, base_to_nllb)
146
+
147
+
148
+ def any_to_neurotolge(lang):
149
+ l = any_to_base(lang).alpha_3
150
+
151
+ return l if l != 'lvs' else 'lv'
152
+
153
+
154
+ def any_to_mdl_type(mdl_type, lang):
155
+ if mdl_type == MDL_NLLB:
156
+ return any_to_nllb(lang)
157
+ elif mdl_type == MDL_MADLAD:
158
+ return any_to_madlad(lang)
159
+ elif mdl_type is None:
160
+ return lang
161
+ elif mdl_type == MDL_LLAMA:
162
+ return lang
163
+ else:
164
+ raise ValueError(f"Unknown mdl_type {mdl_type}")
165
+
166
+ def langs_to_madlad(lang_set):
167
+ return [any_to_madlad(l) for l in lang_set] if lang_set is not None else []
168
+
169
+
170
+ def langs_to_nllb(lang_set):
171
+ return [any_to_nllb(l) for l in lang_set] if lang_set is not None else []
172
+
173
+
174
+ if __name__ == "__main__":
175
+ run_test(NLLB_CODES, MADLAD_CODES, any_to_madlad, "NLLB to MADLAD")
176
+ run_test(NLLB_CODES, NLLB_CODES, any_to_nllb, "NLLB to NLLB")
177
+ run_test(MADLAD_CODES, NLLB_CODES, any_to_nllb, "MADLAD TO NLLB")
178
+ run_test(MADLAD_CODES, MADLAD_CODES, any_to_madlad, "MADLAD TO MADLAD")
179
+
180
+
181
+ def is_nllb(object):
182
+ """
183
+ Check if the object is an NLLB model or tokenizer
184
+ """
185
+ name = object.__class__.__name__.lower()
186
+ return "m2m100" in name or "nllb" in name
187
+
188
+
189
+ def is_madlad(object):
190
+ """
191
+ Check if the object is a MADLAD model or tokenizer
192
+ """
193
+ return "t5" in object.__class__.__name__.lower()
194
+
195
+
196
+ def is_dec_only_llm(obj):
197
+ lcname = obj.__class__.__name__.lower()
198
+ return any(k in lcname for k in ["pretrainedtokenizerfast", "llama", "gemma"])
199
+
200
+
201
+ def get_mdl_type(obj):
202
+ obj = obj.module if hasattr(obj, "module") else obj
203
+
204
+ if is_nllb(obj):
205
+ return MDL_NLLB
206
+ elif is_madlad(obj):
207
+ return MDL_MADLAD
208
+ elif is_dec_only_llm(obj):
209
+ return MDL_LLAMA
210
+ else:
211
+ raise ValueError(f"Object {str(obj)[:200]} is not supported")
212
+
213
+
214
+ def langs_to_mdl_type(mdl_type, lang_set):
215
+ if mdl_type == MDL_NLLB:
216
+ return langs_to_nllb(lang_set)
217
+ elif mdl_type == MDL_MADLAD:
218
+ return langs_to_madlad(lang_set)
219
+ elif mdl_type == MDL_LLAMA:
220
+ return lang_set
221
+ else:
222
+ raise ValueError(f"Model type {mdl_type} is not supported")
223
+
224
+
225
+ def get_joshi_class(lang_code):
226
+ norm_code = any_to_base(lang_code)
227
+
228
+ if norm_code is None:
229
+ return "?"
230
+ else:
231
+ norm_code = norm_code.alpha_3
232
+
233
+ return _rev_joshi[norm_code]
234
+
235
+ def lang_set_maybe_smugri(lang_def):
236
+ if lang_def == "smugri-low":
237
+ preresult = SMUGRI_LOW
238
+ elif lang_def == "smugri-high":
239
+ preresult = SMUGRI_HIGH
240
+ elif lang_def == "smugri":
241
+ preresult = SMUGRI
242
+ else:
243
+ preresult = lang_def
244
+
245
+ return set(preresult.split(","))
246
+
247
+
248
+ def smugri_back(lang_list):
249
+ sll = sorted(lang_list)
250
+
251
+ sll_str = ",".join(sll)
252
+
253
+ if sll_str == SMUGRI_LOW:
254
+ return "smugri-low"
255
+ elif sll_str == SMUGRI_HIGH:
256
+ return "smugri-high"
257
+ elif sll_str == SMUGRI:
258
+ return "smugri-full"
259
+ else:
260
+ return sll_str
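
A few conversions these helpers should yield, as a quick sanity sketch (not part of the commit):

    any_to_nllb("et")          # -> 'est_Latn'  (ISO-639-1 resolved via pycountry, script from the table)
    any_to_nllb("<2fi>")       # -> 'fin_Latn'  (MADLAD-style tag is stripped first)
    any_to_madlad("est_Latn")  # -> '<2et>'     (alpha_2 preferred when it exists)
    any_to_madlad("liv")       # -> '<2liv>'    (no ISO-639-1 code, falls back to alpha_3)
    get_joshi_class("vro")     # -> '1'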
kuidastaltsutadalaamat/legacy/localizemodel.py ADDED
@@ -0,0 +1,45 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import os
4
+
5
+ from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM
6
+ from modelops import mdl_param_count, is_gen_ai, hf_tok
7
+ from tokops import train_or_extend_tokenizer_and_upd_model, save_postokens
8
+ from aux import CmdlineArgs, log
9
+ from legacy.langconv import lang_set_maybe_smugri
10
+
11
+
12
+ def i_dont_like_global_scope_variable_dangers():
13
+ args = CmdlineArgs("Localize an existing HuggingFace model, possibly expanding the tokenizer",
14
+ pos_arg_list=["mdl_id", "save_location"],
15
+ kw_arg_dict={"tok_train_file": None,
16
+ "tok_mdl_id": None,
17
+ "new_langs": None,
18
+ "merge_tokenizers": 0,
19
+ "merge_tok_mdl_id": None })
20
+ if not args.tok_mdl_id:
21
+ args.tok_mdl_id = args.mdl_id
22
+
23
+ if os.path.exists(args.save_location):
24
+ raise Exception(f"Save location '{args.save_location}' already exists, don't want to overwrite")
25
+
26
+ if args.new_langs:
27
+ args.new_langs = lang_set_maybe_smugri(args.new_langs)
28
+
29
+ if is_gen_ai(args.mdl_id):
30
+ model = AutoModelForCausalLM.from_pretrained(args.mdl_id, token=hf_tok)
31
+ else:
32
+ model = AutoModelForSeq2SeqLM.from_pretrained(args.mdl_id, token=hf_tok)
33
+
34
+ tokenizer, added = train_or_extend_tokenizer_and_upd_model(args, model)
35
+
36
+ mdl_size, emb_size = mdl_param_count(model)
37
+ log(f"Cached model with {mdl_size} parameters" +
38
+ ("" if emb_size < 0 else f" of which {emb_size} ({100 * emb_size / mdl_size:.2f}%) are embeddings"))
39
+
40
+ tokenizer.save_pretrained(args.save_location)
41
+ save_postokens(added, args.save_location)
42
+ model.save_pretrained(args.save_location)
43
+
44
+ if __name__ == '__main__':
45
+ i_dont_like_global_scope_variable_dangers()
kuidastaltsutadalaamat/legacy/modelops.py ADDED
@@ -0,0 +1,122 @@
1
+ import os
2
+ from collections import namedtuple
3
+
4
+ import torch
5
+
6
+ from aux import log
7
+
8
+ CouplingSpecTuple = namedtuple("CouplingSpecPair", ["lang_set", "tokenizer", "postokenizer", "model_id", "model"])
9
+
10
+ hf_tok = None
11
+ with open("../hf_token", 'r') as fh:
12
+ hf_tok = fh.read().strip()
13
+
14
+
15
+
16
+ MODULE_CONFIG_FILE = "coupled_module_config.json"
17
+ DATA_STATE_FILE = "data_state.json"
18
+ LOSS_LIST_FILE = "loss_list.json"
19
+
20
+
21
+ def mdl_param_count(model):
22
+ result = 0
23
+ embedding_size = -1
24
+
25
+ for n, p in model.named_parameters():
26
+ this_count = 1
27
+
28
+ for s in p.shape:
29
+ this_count *= s
30
+
31
+ result += this_count
32
+
33
+ # if n == "model.shared.weight":
34
+
35
+ if "shared.weight" in n:
36
+ embedding_size = this_count
37
+
38
+ return result, embedding_size
39
+
40
+ """
41
+
42
+ def to_cpl_spec(langs, model, tokenizer, postokenizer, location):
43
+ mdl_type = get_mdl_type(tokenizer)
44
+ cpl_langs = set(langs_to_mdl_type(mdl_type, langs))
45
+
46
+ return [CouplingSpecTuple(cpl_langs, tokenizer, postokenizer, location, model)]
47
+
48
+
49
+ def _save_json_config(model_dir, filename, data):
50
+ with open(os.path.join(model_dir, filename), "w") as f:
51
+ json.dump(data, f, indent=2, sort_keys=True)
52
+ f.write("\n")
53
+
54
+
55
+ def _load_json_config(model_dir, filename):
56
+ try:
57
+ with open(os.path.join(model_dir, filename), "r") as f:
58
+ return json.load(f)
59
+ except FileNotFoundError:
60
+ return None
61
+
62
+ def save_module_config(model_dir, coupling_specs):
63
+ config = [{'lang_set': list(spec.lang_set), 'model_id': spec.model_id if i > 0 else model_dir} for i, spec in enumerate(coupling_specs)]
64
+ _save_json_config(model_dir, MODULE_CONFIG_FILE, config)
65
+
66
+
67
+ def load_module_config(model_dir):
68
+ result = _load_json_config(model_dir, MODULE_CONFIG_FILE)
69
+
70
+ return result if result is not None else [{"model_id": model_dir, "lang_set": {}}]
71
+ """
72
+
73
+ def save_all_models(location, model, tokenizer, cpl_specs=None, trainer=None):
74
+ if not os.path.exists(location):
75
+ os.makedirs(location)
76
+
77
+ if trainer is not None:
78
+ trainer.save_state(location)
79
+
80
+ model.config.save_pretrained(location)
81
+ model.generation_config.save_pretrained(location)
82
+
83
+ tokenizer.save_pretrained(location)
84
+ """
85
+ if cpl_specs is not None:
86
+ save_module_config(location, cpl_specs)
87
+ """
88
+
89
+
90
+ def report_devices(msg = "", accelerator = None, mdl = None):
91
+ if torch.cuda.is_available():
92
+ # Get the visible devices from CUDA
93
+ visible_devices = torch.cuda.device_count()
94
+
95
+ #log(f"Number of visible GPUs: {visible_devices}")
96
+ msg = f"{msg:30} {visible_devices} GPUs:"
97
+
98
+ # List the actual GPUs being used
99
+ gpu_names = [torch.cuda.get_device_name(i) for i in range(visible_devices)]
100
+ for i, name in enumerate(gpu_names):
101
+ mem_alloc = torch.cuda.memory_allocated(i) / 1024**2
102
+ mem_res = torch.cuda.memory_reserved(i) / 1024**2
103
+
104
+ if mem_alloc > 0.01 or mem_res > 0.01:
105
+ msg += f" {i}: alloc {mem_alloc:.2f} Mb / res {mem_res:.2f} Mb;"
106
+
107
+ log(msg, accelerator=accelerator)
108
+ elif accelerator is not None and accelerator.device.type == "mps":
109
+ mem_alloc = torch.mps.current_allocated_memory() / 1024**2
110
+ log(f"{msg:30} device being used: {accelerator.device}, mem alloc: {mem_alloc} Mb", accelerator=accelerator)
111
+ else:
112
+ log(f"No acceleration")
113
+
114
+ #if mdl is not None:
115
+ # log(f"Model device: {mdl.device}", accelerator=accelerator)
116
+
117
+
118
+ def is_gen_ai(mdl_id):
119
+ lc = mdl_id.lower()
120
+ return not ("madlad" in lc or "nllb" in lc or "m2m" in lc or "bart" in lc)
121
+
122
+
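
A small sketch of the parameter-count helper (model id is a placeholder); the second return value is the size of the shared embedding matrix, or -1 if none was found:

    from transformers import AutoModelForSeq2SeqLM

    mdl = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
    total, emb = mdl_param_count(mdl)
    print(f"{total} parameters, embeddings: {emb} ({100 * emb / total:.2f}%)")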
kuidastaltsutadalaamat/legacy/oldtrainllm.py ADDED
@@ -0,0 +1,90 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import os
4
+ import json
5
+ import torch
6
+ import sys
7
+
8
+ from accelerate import Accelerator
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer
10
+
11
+ from accel import SwitchingAccelerator
12
+ from modelops import hf_tok, save_all_models
13
+
14
+ from aux import log, CmdlineArgs
15
+ from data import do_list_in_batches
16
+
17
+
18
+ def _cmdline_args():
19
+ description = """Train or tune decoder models"""
20
+
21
+ result = CmdlineArgs(description,
22
+ pos_arg_list=["mdl_id", "save_location", "train_file"],
23
+ pos_arg_types=[str, str, str],
24
+ kw_arg_dict={ "continue_training": False, "save_steps": 100, "lr": 1.5e-5,
25
+ "batch_size": 1024, "nr_sents_per_gpu": 4, "log_steps": 1, "epochs": 4,
26
+ "max_length": 3000 })
27
+
28
+ # if the directory args.save_location already exists, raise an exception:
29
+ if not result.continue_training and os.path.exists(result.save_location):
30
+ raise Exception(f"Save location '{result.save_location}' already exists, don't want to overwrite.")
31
+
32
+ if result.nr_sents_per_gpu == 0:
33
+ result.nr_sents_per_gpu = result.batch_size
34
+
35
+ return result
36
+
37
+
38
+ def load_json_list(json_file):
39
+ with open(json_file, "r") as f:
40
+ data = json.load(f)
41
+ return data
42
+
43
+
44
+ def load_hf_model(mdl_id, accelerator=None):
45
+ if accelerator is None:
46
+ model = AutoModelForCausalLM.from_pretrained(mdl_id, token=hf_tok, torch_dtype=torch.bfloat16)
47
+ else:
48
+ model = AutoModelForCausalLM.from_pretrained(mdl_id, token=hf_tok, torch_dtype=torch.bfloat16, device_map=accelerator.device)
49
+ return model
50
+
51
+
52
+ def load_hf_tokenizer(mdl_id):
53
+ tokenizer = AutoTokenizer.from_pretrained(mdl_id, token=hf_tok)
54
+ return tokenizer
55
+
56
+
57
+ def _no_globals_main():
58
+ accelerator = Accelerator()
59
+
60
+ try:
61
+ args = _cmdline_args()
62
+
63
+ log(f"Num proc: {accelerator.num_processes}, proc ID: {accelerator.process_index}")
64
+ log("loading model", accelerator=accelerator)
65
+ mdl = load_hf_model(args.mdl_id)
66
+
67
+ log("loading tokenizer", accelerator=accelerator)
68
+ tok = load_hf_tokenizer(args.mdl_id)
69
+
70
+ log("loading data", accelerator=accelerator, all_threads=True)
71
+ train_set = load_json_list(args.train_file)
72
+
73
+ log("training", accelerator=accelerator)
74
+
75
+ acc_trainer = SwitchingAccelerator(train_set, args, mdl, tok, preinit_acc=accelerator)
76
+ upd_model = acc_trainer.train()
77
+
78
+ log("saving", accelerator=accelerator)
79
+ save_all_models(args.save_location, upd_model, tok)
80
+ except Exception as e:
81
+ # in multiprocess scenarios it is hard to read the stack trace, so just show one:
82
+ if accelerator.is_main_process:
83
+ raise e
84
+
85
+
86
+ if __name__ == "__main__":
87
+ #sys.argv = "_ models/llama3.2-1b models/newmdl tmp.json".split()
88
+ #sys.argv = "_ models/llama3.2-1b models/newmdl2 tmpx.json batch_size=16 nr_sents_per_gpu=1 log_steps=1 save_steps=2000 epochs=1".split()
89
+
90
+ _no_globals_main()
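
A launch sketch mirroring the commented-out sys.argv examples above; since the script builds an Accelerator, it would typically be started through accelerate (paths and values are placeholders):

    accelerate launch oldtrainllm.py models/llama3.2-1b models/newmdl train.json \
        batch_size=16 nr_sents_per_gpu=1 log_steps=1 save_steps=2000 epochs=1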
kuidastaltsutadalaamat/legacy/parasynth.py ADDED
@@ -0,0 +1,139 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import sys
4
+ import json
5
+
6
+ from collections import defaultdict
7
+
8
+ from benchmark import get_hyp_cache_dir, translate_all_hyps
9
+ from inference import load_and_init_module_config
10
+ from legacy.langconv import get_high_set, any_to_mdl_type, get_mdl_type
11
+ from accelerate import Accelerator
12
+ from aux import log
13
+
14
+
15
+ def load_raw_data(path):
16
+ with open(path, 'r') as f:
17
+ return json.load(f)
18
+
19
+
20
+ def save_raw_data(path, data):
21
+ with open(path, 'w') as f:
22
+ json.dump(data, f, indent=2)
23
+
24
+
25
+ def apply_func_to_hires_snts(snt_set, func):
26
+ high_set = get_high_set()
27
+
28
+ for tupl in snt_set:
29
+ langs = [k for k in tupl if "-dia" not in k and k in high_set]
30
+
31
+ if langs:
32
+ revlangs = high_set - set(langs)
33
+
34
+ for revlang in revlangs:
35
+ for lang in langs:
36
+ # translate sentences tupl[lang] from lang to revlang
37
+ # OR
38
+ # add the result as tupl[revlang]
39
+ func(tupl, lang, revlang)
40
+
41
+
42
+ def report_part_stats(part, part_index, num_parts):
43
+ hi_set = get_high_set()
44
+
45
+ num_snts = len(part['sentences'])
46
+ hires_langs = {k for k in part['sentences'][0] if "dia" not in k and k in hi_set}
47
+ num_hires_langs = len(hires_langs)
48
+ langs_to_do = hi_set - hires_langs
49
+ num_to_translate = num_hires_langs * len(langs_to_do)
50
+
51
+ log(f"Part {part_index + 1}/{num_parts}; {num_snts} sentences, num hires: {num_hires_langs}, to translate: {num_to_translate}")
52
+
53
+ return num_snts * num_hires_langs, num_snts * num_to_translate
54
+
55
+
56
+ def add_hires_synth_data(mdl_id, corpus_in, corpus_out, dry=False):
57
+ accelerator = Accelerator()
58
+
59
+ log("Loading data", accelerator)
60
+ data = load_raw_data(corpus_in)
61
+
62
+
63
+ log("Loading model", accelerator)
64
+ if dry:
65
+ main_model, module_config = None, None
66
+ mdl_type = None
67
+ else:
68
+ main_model, module_config = load_and_init_module_config(mdl_id, accelerator)
69
+ mdl_type = get_mdl_type(main_model)
70
+
71
+ if accelerator.is_main_process:
72
+ _ = get_hyp_cache_dir(mdl_id, create=True)
73
+ l = len(data)
74
+
75
+ tot_snt = 0
76
+ tot_tr = 0
77
+
78
+ for i, part in enumerate(data):
79
+ tr_dict = defaultdict(lambda: defaultdict(lambda: None))
80
+
81
+ num_snt, num_tr = report_part_stats(part, i, l)
82
+ tot_snt += num_snt
83
+ tot_tr += num_tr
84
+
85
+ if not dry:
86
+ def _transfer(tup, src, tgt):
87
+ srcm = any_to_mdl_type(mdl_type, src)
88
+ tgtm = any_to_mdl_type(mdl_type, tgt)
89
+
90
+ lp = f"{srcm}-{tgtm}"
91
+ inp_snt = tup[src]
92
+
93
+ # this "touches" the value: if it was not there, now it is None
94
+ # and if it was there, then we use it
95
+ if tr_dict[lp][inp_snt] is not None:
96
+ tup[tgt] = tr_dict[lp][inp_snt]
97
+
98
+ # collect sentences to translate
99
+ apply_func_to_hires_snts(part['sentences'], _transfer)
100
+
101
+ in_tr_dict_list = { lp: sorted(tr_dict[lp].items()) for lp in tr_dict }
102
+
103
+ log(f"Translating part {i+1}/{l}", accelerator)
104
+ #translate_cache_dict(tr_dict, mdl_id, module_config, corpus_in, accelerator)
105
+ translate_all_hyps(in_tr_dict_list, module_config, mdl_id, f"{corpus_in}-{i}", accelerator)
106
+
107
+ log(f"Collecting part {i+1}/{l}", accelerator)
108
+ out_tr_dict_list = translate_all_hyps(in_tr_dict_list, module_config, mdl_id, corpus_in)
109
+
110
+ for lp in out_tr_dict_list:
111
+ for inp, outp in out_tr_dict_list[lp]:
112
+ tr_dict[lp][inp] = outp
113
+
114
+ # put translations back into data structure
115
+ log(f"Integrating part {i+1}/{l}", accelerator)
116
+ apply_func_to_hires_snts(part['sentences'], _transfer)
117
+
118
+ log(f"Total sentences: {tot_snt}, total to generate: {tot_tr}", accelerator)
119
+ if not dry:
120
+ log("Saving data", accelerator)
121
+ save_raw_data(corpus_out, data)
122
+
123
+ if __name__ == '__main__':
124
+ try:
125
+ mdl_id_param = sys.argv[1]
126
+ corpus_param = sys.argv[2]
127
+ corpus_output_param = sys.argv[3]
128
+ except IndexError:
129
+ mdl_id_param = "models/nllb600m"
130
+ corpus_param = "data/flt.json"
131
+ corpus_output_param = "data/fltout.json"
132
+
133
+ try:
134
+ _ = sys.argv[4]
135
+ dry_run = True
136
+ except IndexError:
137
+ dry_run = False
138
+
139
+ add_hires_synth_data(mdl_id_param, corpus_param, corpus_output_param, dry_run)
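
The two passes around _transfer rely on a nested defaultdict "touch": the first pass registers every needed (language pair, source sentence) key with value None, translations are then produced for exactly those keys, and the second pass reads the filled-in values back. A minimal self-contained sketch of that pattern (a dummy translator stands in for translate_all_hyps):

from collections import defaultdict

# tr_dict[lang_pair][source_sentence] -> translation, or None if not translated yet
tr_dict = defaultdict(lambda: defaultdict(lambda: None))

needed = [("et-en", "Tere"), ("et-en", "Aitäh")]

# Pass 1: "touch" the keys; reading a missing key creates it with value None.
for lp, snt in needed:
    _ = tr_dict[lp][snt]

# Translate exactly the touched keys (dummy translator instead of translate_all_hyps).
for lp in tr_dict:
    for snt in tr_dict[lp]:
        tr_dict[lp][snt] = f"<{lp}> {snt}"

# Pass 2: the same lookups now return the filled-in translations.
assert all(tr_dict[lp][snt] is not None for lp, snt in needed)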
kuidastaltsutadalaamat/legacy/pretok.py ADDED
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import os
4
+
5
+ from data import MultilingualBatchingCachingDataset
6
+ from aux import log, CmdlineArgs
7
+ from legacy.langconv import lang_set_maybe_smugri
8
+ from modelops import to_cpl_spec
9
+ from tokops import load_tokenizer
10
+
11
+ """
12
+ def load_hf_tok(mdl_id, tok_id=None, verbose=False):
13
+ if tok_id is None:
14
+ tok_id = mdl_id
15
+
16
+ tokenizer = AutoTokenizer.from_pretrained(tok_id, token=hf_tok)
17
+
18
+ return tokenizer
19
+ """
20
+
21
+
22
+ def _cmdline_args():
23
+ description = """Pre-tokenize data and cache the results"""
24
+
25
+ pos_args = ["mdl_id", "train_file", "langs", "cache_path"]
26
+ pos_types = [str, str, lang_set_maybe_smugri, str]
27
+
28
+ kw_args = { "anchor_mdl_id": None, "anchor_langs": None, "batch_size": 16, "shard_size": 100000,
29
+ "exclude_set": None, "max_snt_len": 1024, "sort_by_len": False }
30
+
31
+ #post-process the arguments
32
+ args = CmdlineArgs(description, pos_arg_list=pos_args, pos_arg_types=pos_types, kw_arg_dict=kw_args)
33
+
34
+ if args.anchor_langs is not None:
35
+ args.anchor_langs = lang_set_maybe_smugri(args.anchor_langs)
36
+
37
+ # if the directory args.cache_path already exists, raise an exception:
38
+ if os.path.exists(args.cache_path):
39
+ raise Exception(f"Save location '{args.cache_path}' already exists, don't want to overwrite")
40
+
41
+ log(f"Launched as {args}")
42
+
43
+ return args
44
+
45
+
46
+ def oh_look_another_do_main_function():
47
+ args = _cmdline_args()
48
+
49
+ log("loading tokenizer")
50
+ main_tokenizer, main_postok = load_tokenizer(args.mdl_id) #load_hf_tok(args.mdl_id, verbose=True)
51
+
52
+ coupling_specs = to_cpl_spec(args.langs, None, main_tokenizer, main_postok, None)
53
+
54
+ if args.anchor_mdl_id is not None:
55
+ log("loading anchor model tokenizer")
56
+ anchor_tokenizer, anc_postok = load_tokenizer(args.anchor_mdl_id)
57
+
58
+ coupling_specs += to_cpl_spec(args.anchor_langs, None, anchor_tokenizer, anc_postok, None)
59
+
60
+ mbd = MultilingualBatchingCachingDataset(args.train_file, coupling_specs, args)
61
+ mbd.load_and_cache_data(args.cache_path)
62
+
63
+
64
+ if __name__ == "__main__":
65
+ oh_look_another_do_main_function()
kuidastaltsutadalaamat/legacy/testmem.py ADDED
@@ -0,0 +1,100 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import torch.optim
4
+ import sys
5
+ import subprocess
6
+ import random
7
+
8
+ from accelerate import Accelerator
9
+ from transformers import AutoModelForCausalLM, get_scheduler, AutoModelForSeq2SeqLM
10
+ from datasets import load_dataset
11
+
12
+ from aux import CmdlineArgs, log
13
+ from legacy.langconv import is_dec_only_llm
14
+ from modelops import report_devices, hf_tok
15
+ from tokops import load_tokenizer, tokenizeit
16
+
17
+
18
+ def run_test(mdl_id, batch_sizes, ctxlen, acc):
19
+ #state = AcceleratorState()
20
+ log(f"Num proc: {acc.num_processes}, proc ID: {acc.process_index}")
21
+
22
+ report_devices("Initial state:", accelerator=acc)
23
+
24
+ t, pt = load_tokenizer(mdl_id) # AutoTokenizer.from_pretrained(mdl_id, token=hf_tok)
25
+ if is_dec_only_llm(t):
26
+ m = AutoModelForCausalLM.from_pretrained(mdl_id, token=hf_tok, torch_dtype=torch.bfloat16)
27
+ log("Decoder-only model")
28
+ else:
29
+ m = AutoModelForSeq2SeqLM.from_pretrained(mdl_id, token=hf_tok, torch_dtype=torch.bfloat16)
30
+ log("Encoder-decoder model")
31
+
32
+ opt = torch.optim.AdamW(m.parameters(), lr=1e-5)
33
+ lrs = get_scheduler("linear", optimizer=opt, num_warmup_steps=100, num_training_steps=1000)
34
+ opt, lrs, m = acc.prepare(opt, lrs, m)
35
+
36
+ report_devices("Models in VRAM:", accelerator=acc)
37
+ m.train()
38
+
39
+ ds = load_dataset("Helsinki-NLP/europarl", "en-et")
40
+ max_idx = len(ds['train'])
41
+
42
+ for batch_size in batch_sizes:
43
+ print("")
44
+
45
+ for _ in range(10):
46
+ inp_idx = random.randint(0, max_idx-batch_size)
47
+
48
+ raw_inp = [ds['train'][i]['translation']['et'] for i in range(inp_idx, inp_idx+batch_size)]
49
+
50
+ if is_dec_only_llm(t):
51
+ inp = tokenizeit((t, pt), raw_inp, ctxlen, is_target=False, is_llm=True)
52
+ else:
53
+ inp = tokenizeit((t, pt), raw_inp, ctxlen, is_target=False, is_llm=False)
54
+
55
+ inp['labels'] = inp['input_ids']
56
+ inp.to(m.device)
57
+
58
+ outputs = m(**inp)
59
+
60
+ loss = outputs.loss
61
+ report_devices(f"While training:", accelerator=acc)
62
+ log(f"Batches : {[inp[k].size() for k in 'input_ids labels attention_mask'.split(' ')]}")
63
+ log(f"Batch total: {sum([inp[k].size()[0] * inp[k].size()[1] for k in 'input_ids labels attention_mask'.split(' ')])}")
64
+
65
+ try:
66
+ if acc.is_main_process:
67
+ result = subprocess.run(['rocm-smi'], capture_output=True, text=True)
68
+ print(result.stdout)
69
+ except Exception:  # rocm-smi may be unavailable; ignore
70
+ pass
71
+
72
+ acc.backward(loss)
73
+ acc.wait_for_everyone()
74
+
75
+ report_devices(f"Models gradients in VRAM, batch size {batch_size}:", accelerator=acc)
76
+
77
+ print(f"Testing {mdl_id} with batch size {batch_size}: success!")
78
+
79
+
80
+
81
+ if __name__ == "__main__":
82
+ if len(sys.argv) > 1:
83
+ args = CmdlineArgs("Test the VRAM usage by a model with different batch sizes, comma-separated",
84
+ pos_arg_list=["mdl_id", "batch_sizes"],
85
+ kw_arg_dict={"ctxlen": 2048})
86
+
87
+ clean_bs = [int(bs) for bs in args.batch_sizes.split(",")]
88
+ mdl_id = args.mdl_id
89
+ ctxlen = args.ctxlen
90
+ else:
91
+ mdl_id = "meta-llama/Llama-3.2-1B"
92
+ clean_bs = [16, 32, 64]
93
+ ctxlen = 2048
94
+
95
+ acc = Accelerator()
96
+ try:
97
+ run_test(mdl_id, clean_bs, ctxlen, acc)
98
+ except Exception as e:
99
+ if acc.is_main_process:
100
+ raise e
kuidastaltsutadalaamat/legacy/tokops.py ADDED
@@ -0,0 +1,350 @@
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import sentencepiece as spm
4
+ import json
5
+
6
+ from transformers import AutoTokenizer
7
+ from transformers.models.nllb import NllbTokenizer
8
+ from transformers.models.t5 import T5Tokenizer
9
+ from collections import defaultdict
10
+
11
+ from aux import log
12
+ from legacy.langconv import langs_to_madlad, langs_to_nllb, is_nllb, is_madlad, is_dec_only_llm
13
+ from modelops import hf_tok
14
+
15
+
16
+ def test_tok(tok, snt, lang):
17
+ tok.src_lang = lang
18
+ out = tok(text = snt)
19
+ print(out['input_ids'])
20
+ print(tok.tokenize(snt))
21
+ print(tok.convert_ids_to_tokens(out['input_ids']))
22
+ print("-")
23
+
24
+
25
+ def get_stupid_correction(mdl_id):
26
+ l_mdl_id = mdl_id.lower()
27
+
28
+ if "m2m" in l_mdl_id:
29
+ correction = 108
30
+ elif "nllb" in l_mdl_id:
31
+ correction = 2
32
+ else:
33
+ correction = 0
34
+
35
+ return correction
36
+
37
+
38
+ def tsv_to_json_vocab(location):
39
+ new_location = location + ".json"
40
+
41
+ with open(location, "r") as f, open(new_location, "w") as w:
42
+ idx_dict = { "<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3 }
43
+
44
+ for line in f:
45
+ tok, _ = line.strip().split("\t")
46
+ if tok not in idx_dict:
47
+ idx_dict[tok] = len(idx_dict)
48
+
49
+ json.dump(idx_dict, w)
50
+
51
+ return new_location
52
+
53
+
54
+ def get_unk_toks(tokenizer, corpus, verbose=False):
55
+ unk_id = tokenizer.unk_token_id
56
+ unk_toks = defaultdict(int)
57
+
58
+ all_toks = set()
59
+
60
+ total_count = 0
61
+ unk_count = 0
62
+
63
+ with open(corpus, "r", encoding='utf-8') as f:
64
+ for snt in f:
65
+ toks = tokenizer.tokenize(snt.strip())
66
+ ids = tokenizer.convert_tokens_to_ids(toks)
67
+
68
+ for t, i in zip(toks, ids):
69
+ if i == unk_id:
70
+ unk_toks[t] += 1
71
+ unk_count += 1
72
+ total_count += 1
73
+
74
+ all_toks.add(t)
75
+
76
+ if verbose:
77
+ print(f"Tokenizer vocab size: {tokenizer.vocab_size}, nr of actually used tokens: {len(all_toks)}")
78
+ print(f"Corpus token count: {total_count}, UNK token percentage: {100*unk_count/total_count:.2f}%")
79
+
80
+ return list(unk_toks)
81
+
82
+
83
+ def get_top_toks(tokenizer, corpus, num_top_toks):
84
+ freq_count = defaultdict(int)
85
+
86
+ with open(corpus, "r", encoding='utf-8') as f:
87
+ for snt in f:
88
+ toks = tokenizer.tokenize(snt.strip())
89
+
90
+ for t in toks:
91
+ freq_count[t] += 1
92
+
93
+ sorted_freq_count = sorted(freq_count.keys(), key=lambda x: -freq_count[x])
94
+
95
+ return sorted_freq_count[:num_top_toks]
96
+
97
+
98
+ def extend_tok_langs(tokenizer, lang_set_raw):
99
+ if is_nllb(tokenizer):
100
+ lang_set = langs_to_nllb(lang_set_raw)
101
+ elif is_madlad(tokenizer):
102
+ lang_set = langs_to_madlad(lang_set_raw)
103
+ elif is_dec_only_llm(tokenizer):
104
+ return
105
+ else:
106
+ raise NotImplementedError
107
+
108
+ if 'additional_special_tokens' in tokenizer.special_tokens_map:
109
+ orig_langs = tokenizer.special_tokens_map['additional_special_tokens']
110
+ orig_lang_set = set(orig_langs)
111
+
112
+ addable_langs = list(set(lang_set) - orig_lang_set)
113
+ else:
114
+ orig_langs = []
115
+ addable_langs = lang_set
116
+
117
+ tokenizer.add_special_tokens({'additional_special_tokens': orig_langs + addable_langs})
118
+
119
+
120
+ def wrap_tok_in_correct_class(location, base_model_id, lang_set):
121
+ l_base_mdl_id = base_model_id.lower()
122
+
123
+ if "nllb" in l_base_mdl_id:
124
+ nllb_lang_set = langs_to_nllb(lang_set)
125
+ return NllbTokenizer(location + ".model", additional_special_tokens=nllb_lang_set)
126
+
127
+ elif "madlad" in l_base_mdl_id or "t5" in l_base_mdl_id:
128
+ madlad_lang_set = langs_to_madlad(lang_set)
129
+ return T5Tokenizer(location + ".model", additional_special_tokens=madlad_lang_set)
130
+ else:
131
+ raise ValueError("Incompatible model type for tokenizer")
132
+
133
+
134
+ def remove_tmp_spm_files(location):
135
+ for tmp_file in (".vocab", ".model"):
136
+ os.remove(location + tmp_file)
137
+
138
+
139
+ def learn_spm_tokenizer(corpus, save_location, base_model_id, vocab_size, lang_set=None):
140
+ tmp_location = os.path.join(save_location, "sentencepiece.bpe.tmp")
141
+ os.makedirs(save_location, exist_ok=True)
142
+
143
+ spm.SentencePieceTrainer.train(input=corpus, model_prefix=tmp_location, vocab_size=vocab_size)
144
+
145
+ tok = wrap_tok_in_correct_class(tmp_location, base_model_id, lang_set)
146
+
147
+ remove_tmp_spm_files(tmp_location)
148
+
149
+ return tok
150
+
151
+
152
+ def do_new_tok(tokargs):
153
+ correction = get_stupid_correction(tokargs.mdl_id)
154
+ voc_size = tokargs.vocab_size - correction
155
+ location = tokargs.save_location
156
+
157
+ return learn_spm_tokenizer(tokargs.tok_train_file, location, base_model_id=tokargs.tok_mdl_id,
158
+ vocab_size=voc_size, lang_set=tokargs.new_langs)
159
+
160
+
161
+ def remove_known_toks(toks, tokenizer):
162
+ return [t for t in toks if t not in tokenizer.get_vocab()]
163
+
164
+
165
+ def _handle_new_tokenizer(args):
166
+ assert args.new_langs is not None, "lang_set must be provided"
167
+ assert args.tok_train_file is not None, "tok_train_file must be provided"
168
+ args.vocab_size = int(args.vocab_size)
169
+
170
+ log("Training new tokenizer")
171
+ tokenizer = do_new_tok(args)
172
+
173
+ return tokenizer
174
+
175
+
176
+ def get_postoken_filename(save_location):
177
+ return os.path.join(save_location, "postokens.json")
178
+
179
+
180
+ def save_postokens(added_tokens, location):
181
+ if added_tokens is not None:
182
+ os.makedirs(location, exist_ok=True)
183
+ with open(get_postoken_filename(location), "w") as f:
184
+ json.dump(added_tokens, f)
185
+
186
+
187
+ def _handle_adding_tokens(tokenizer, toks_to_add, args):
188
+ if len(toks_to_add) == 0:
189
+ return None
190
+
191
+ log(f"Adding tokens: {toks_to_add}")
192
+
193
+ base_idx = len(tokenizer)
194
+
195
+ added_tok_dict = { t: (base_idx + i) for i, t in enumerate(toks_to_add) }
196
+ added_tok_rev_dict = { int(i): t for t, i in added_tok_dict.items() }
197
+
198
+ comb_dict = { 'tok2idx': added_tok_dict, 'idx2tok': added_tok_rev_dict }
199
+
200
+ save_postokens(comb_dict, args.save_location)
201
+
202
+ return comb_dict
203
+
204
+
205
+ def _handle_existing_tokenizer(args):
206
+ log("Reusing existing tokenizer")
207
+ tokenizer, added_tokens = load_tokenizer(args.tok_mdl_id)
208
+
209
+ if args.new_langs is not None:
210
+ log("Extending existing tokenizer with languages")
211
+ extend_tok_langs(tokenizer, args.new_langs)
212
+
213
+ if args.merge_tokenizers or args.merge_tok_mdl_id:
214
+ """
215
+ assert args.tok_train_file is not None, "For merging tokenizers a text file must be provided" \
216
+ + " to find the top N tokens to merge"
217
+ assert args.merge_tokenizers is not None and args.merge_tok_mdl_id is not None, \
218
+ "Both merge_tokenizers and merge_tok_mdl_id must be provided"
219
+ """
220
+ raise NotImplementedError("Merging is currently not supported")
221
+
222
+ added_tok_count = 0
223
+
224
+ if args.tok_train_file:
225
+ if args.merge_tokenizers:
226
+ """
227
+ merge_tok_max = int(args.merge_tokenizers)
228
+ log(f"Extending existing tokenizer ({args.merge_tok_mdl_id}) with up to {merge_tok_max} top tokens" +
229
+ f" from another tokenizer and corpus ({args.tok_train_file})")
230
+ new_tok = AutoTokenizer.from_pretrained(args.merge_tok_mdl_id, token=hf_tok)
231
+ toks_to_maybe_add = get_top_toks(new_tok, args.tok_train_file, merge_tok_max)
232
+ """
233
+ raise NotImplementedError("Merging is currently not supported")
234
+
235
+ else:
236
+ log(f"Extending existing tokenizer with UNK tokens from corpus ({args.tok_train_file})")
237
+ toks_to_maybe_add = get_unk_toks(tokenizer, args.tok_train_file, verbose=True)
238
+
239
+ toks_to_add = remove_known_toks(toks_to_maybe_add, tokenizer)
240
+ added_tok_count = len(toks_to_add)
241
+ added_tokens = _handle_adding_tokens(tokenizer, toks_to_add, args)
242
+
243
+ return tokenizer, added_tok_count, added_tokens
244
+
245
+
246
+ def train_or_extend_tokenizer_and_upd_model(args, model):
247
+ if hasattr(args, "vocab_size") and args.vocab_size:
248
+ # train a new sentence-piece tokenizer
249
+ tokenizer = _handle_new_tokenizer(args)
250
+ added_tok_count = 0
251
+ added_dict = None
252
+ else:
253
+ # save the pre-trained model's tokenizer, possibly adding new languages and tokens
254
+ tokenizer, added_tok_count, added_dict = _handle_existing_tokenizer(args)
255
+
256
+ upd_amt = get_stupid_correction(args.mdl_id)
257
+ new_len = len(tokenizer) + added_tok_count
258
+
259
+ model.resize_token_embeddings(new_len + upd_amt)
260
+
261
+ return tokenizer, added_dict
262
+
263
+
264
+ def load_tokenizer(tok_mdl_id):
265
+ orig_tokenizer = AutoTokenizer.from_pretrained(tok_mdl_id, token=hf_tok)
266
+
267
+ postoken_file = get_postoken_filename(tok_mdl_id)
268
+ if os.path.exists(postoken_file):
269
+ with open(postoken_file, "r") as f:
270
+ postokens = json.load(f)
271
+ else:
272
+ postokens = None
273
+
274
+ return orig_tokenizer, postokens
275
+
276
+
277
+ def tokenize_batch(tokenizer, sntlist, maxlen=8000):
278
+ #tokenizer.pad_token = '<|reserved_special_token_0|>'
279
+ tokenizer.pad_token = tokenizer.eos_token
280
+ output = tokenizer(sntlist, return_tensors="pt", max_length=maxlen, truncation=True, add_special_tokens=True,
281
+ padding=True)
282
+ output["labels"] = output["input_ids"].detach().clone()
283
+ return output
284
+
285
+ """
286
+
287
+ def detokenizeit(toktup, tok_ids):
288
+ #return toktup[0].decode(tok_ids, skip_special_tokens=True)
289
+
290
+ toks = []
291
+
292
+ for tok_id_tensor in tok_ids:
293
+ tok_id = tok_id_tensor.item()
294
+ try:
295
+ if tok_id not in toktup[0].added_tokens_decoder:
296
+ toks.append(toktup[0].convert_ids_to_tokens(tok_id))
297
+ except IndexError:
298
+ toks.append(toktup[1]['idx2tok'][str(tok_id)])
299
+
300
+ result = "".join(toks).replace("▁", " ")[1:]
301
+
302
+ return result, toks
303
+
304
+
305
+ def detokenizemany(toktup, tok_mtx):
306
+ result = [detokenizeit(toktup, tok_ids)[0] for tok_ids in tok_mtx]
307
+
308
+ return result
309
+
310
+
311
+
312
+
313
+
314
+ def run_tokenizer_testing():
315
+ args = CmdlineArgs("Test a tokenizer: tokenize & de-tokenize some text and check if these match",
316
+ pos_arg_list=["tok_mdl_id", "txt_file"])
317
+
318
+ #tokenizer = AutoTokenizer.fromm_pretrained(args.tok_mdl_id, token=hf_tok) if os.path.exists()
319
+ toktup = load_tokenizer(args.tok_mdl_id)
320
+
321
+ success = 0
322
+ failure = 0
323
+
324
+ with open(args.txt_file, "r", encoding="utf-8") as f:
325
+ snts = f.read().split("\n")
326
+
327
+ toks = tokenizeit(toktup, snts, 1024, False)
328
+
329
+ for i, snt in enumerate(snts):
330
+ tok_ids = toks['input_ids'][i]
331
+
332
+ #detoks = toktup[0].decode(tok_ids, skip_special_tokens=True)
333
+ detoks, tok_strs = detokenizeit(toktup, tok_ids)
334
+
335
+ if detoks != snt:
336
+ failure += 1
337
+ #log(f"Tokens: {toktup[0].convert_ids_to_tokens(tok_ids)}")
338
+ log(f"Tokens: {tok_strs}")
339
+ log(f"Test failed:\n{snt} !=\n{detoks}")
340
+ else:
341
+ success += 1
342
+ i += 1
343
+
344
+ log(f"Test result: {success} successful / {failure} failed")
345
+
346
+
347
+ if __name__ == "__main__":
348
+ sys.argv = ['', 'models/nllbxt', 'data/tok-test.txt']
349
+ run_tokenizer_testing()
350
+ """
kuidastaltsutadalaamat/legacy/trainmodel.py ADDED
@@ -0,0 +1,96 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import os
4
+ import torch
5
+
6
+ from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM
7
+
8
+ from legacy.accel import SwitchingAccelerator
9
+ from accelerate import Accelerator
10
+ from data import MultilingualDatasetIterator
11
+ from aux import log, CmdlineArgs
12
+ from legacy.langconv import lang_set_maybe_smugri, is_dec_only_llm
13
+ from modelops import mdl_param_count, to_cpl_spec, hf_tok
14
+ from tokops import load_tokenizer
15
+
16
+
17
+ def freeze_model(model):
18
+ for n, p in model.named_parameters():
19
+ p.requires_grad = False
20
+
21
+
22
+ def load_hf_mdl_and_tok(mdl_id, tok_id=None, verbose=False):
23
+ if tok_id is None:
24
+ tok_id = mdl_id
25
+
26
+ tokenizer = load_tokenizer(tok_id) # AutoTokenizer.from_pretrained(tok_id, token=hf_tok)
27
+
28
+ if is_dec_only_llm(tokenizer[0]):
29
+ model = AutoModelForCausalLM.from_pretrained(mdl_id, token=hf_tok, torch_dtype=torch.bfloat16)
30
+ else:
31
+ model = AutoModelForSeq2SeqLM.from_pretrained(mdl_id, token=hf_tok, torch_dtype=torch.bfloat16)
32
+
33
+ if verbose:
34
+ mdl_size, _ = mdl_param_count(model)
35
+ log(f"Loaded {mdl_id} with {mdl_size} params, voc size {model.config.vocab_size}")
36
+
37
+ return model, tokenizer
38
+
39
+
40
+ def _cmdline_args():
41
+ description = """Train or tune models"""
42
+
43
+ pos_args = ["mdl_id", "save_location", "train_pretok_file", "langs"]
44
+ pos_types = [str, str, str, lang_set_maybe_smugri]
45
+
46
+ kw_args = { "anchor_mdl_id": None, "anchor_langs": None, "continue_training": False,
47
+ "save_steps": 100000, "lr": 1.5e-5, "nr_snts_in_batch": 0, "nr_words_in_batch": 0,
48
+ "log_steps": 100, "epochs": 4 }
49
+
50
+ #post-process the arguments
51
+ args = CmdlineArgs(description, pos_arg_list=pos_args, pos_arg_types=pos_types, kw_arg_dict=kw_args)
52
+
53
+ if args.anchor_langs is not None:
54
+ args.anchor_langs = lang_set_maybe_smugri(args.anchor_langs)
55
+
56
+ if (args.nr_snts_in_batch > 0) == (args.nr_words_in_batch > 0):
57
+ raise Exception(f"Specify the batch size either in words or in sentences.")
58
+
59
+ # if the directory args.save_location already exists, raise an exception:
60
+ if not args.continue_training and os.path.exists(args.save_location):
61
+ raise Exception(f"Save location '{args.save_location}' already exists, don't want to overwrite.")
62
+
63
+ return args
64
+
65
+
66
+ def yes_i_called_this_function_do_main():
67
+ args = _cmdline_args()
68
+ tmp_acc = Accelerator()
69
+
70
+ log(f"Num proc: {tmp_acc.num_processes}, proc ID: {tmp_acc.process_index}")
71
+
72
+ log("loading coupled model and tokenizer", accelerator=tmp_acc)
73
+ main_model, main_tokenizer = load_hf_mdl_and_tok(args.mdl_id, verbose=True)
74
+
75
+ coupling_specs = to_cpl_spec(args.langs, main_model, main_tokenizer[0], main_tokenizer[1], args.save_location)
76
+
77
+ if args.anchor_mdl_id:
78
+ log("loading anchor model and tokenizer", accelerator=tmp_acc)
79
+ anchor_model, anchor_tokenizer = load_hf_mdl_and_tok(args.anchor_mdl_id, verbose=True)
80
+ freeze_model(anchor_model)
81
+
82
+ coupling_specs += to_cpl_spec(args.anchor_langs, anchor_model, anchor_tokenizer[0], anchor_tokenizer[1], args.anchor_mdl_id)
83
+
84
+ train_set = MultilingualDatasetIterator(args.train_pretok_file)
85
+
86
+ acc_trainer = SwitchingAccelerator(coupling_specs, train_set, args)
87
+
88
+ upd_model, loss_list = acc_trainer.train()
89
+
90
+ #save_all_models(args.save_location, upd_model, main_tokenizer, coupling_specs, loss_list, trainer=acc_trainer.accelerator)
91
+
92
+
93
+ if __name__ == "__main__":
94
+ #sys.argv = ". models/smol models/smol_next data/smugri4a-dev.json-tokcache/thiscache.json smugri log_steps=1 lr=1e-5".split()
95
+ #sys.argv = ". models/llama3.2-1b models/llama-tuned data/smugri4a-dev.json-tokcache/llama.json smugri".split()
96
+ yes_i_called_this_function_do_main()
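
freeze_model above simply switches off gradients for every parameter of the anchor model, so only the main model's weights are updated during coupled training. A tiny self-contained sketch of the same pattern, with a Linear layer standing in for the anchor model:

import torch

anchor = torch.nn.Linear(8, 8)          # stand-in for the anchor model
for _, p in anchor.named_parameters():  # same loop as freeze_model
    p.requires_grad = False

assert all(not p.requires_grad for p in anchor.parameters())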
kuidastaltsutadalaamat/legacy/translate_backup.py ADDED
@@ -0,0 +1,309 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+
5
+ import sys
6
+ import requests
7
+ import re
8
+ import torch
9
+
10
+ from aux import CmdlineArgs, log
11
+ #from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
12
+ from trainllm import load_hf_tokenizer, load_hf_model
13
+ from data import do_list_in_batches
14
+ from modelops import hf_tok, is_gen_ai
15
+ from collections import defaultdict
16
+ from langconv import is_nllb, is_madlad, any_to_mdl_type, get_mdl_type, any_to_neurotolge, is_dec_only_llm
17
+ from tokops import load_tokenizer, tokenizeit, detokenizemany
18
+
19
+
20
+ def prepare_for_translation(provided_inputs, toktup, input_language, output_language=None, device=None):
21
+ if is_nllb(toktup[0]):
22
+ toktup[0].src_lang = input_language
23
+ inputs_to_process = provided_inputs
24
+ elif is_madlad(toktup[0]):
25
+ madlad_tgt_lang = output_language
26
+ inputs_to_process = [f"{madlad_tgt_lang} {inp}" for inp in provided_inputs]
27
+ else:
28
+ raise NotImplementedError("Model type not supported")
29
+
30
+ prepared_inputs = tokenizeit(toktup, inputs_to_process, 1024, False) #tokenizer(inputs_to_process, return_tensors="pt", padding=True, truncation=True, max_length=512)
31
+
32
+ if device is not None:
33
+ prepared_inputs.to(device)
34
+
35
+ frc_bos = toktup[0].get_lang_id(output_language) if output_language is not None else None
36
+
37
+ return prepared_inputs, frc_bos
38
+
39
+
40
+ def finalize_translation(outputs, toktup):
41
+ result = detokenizemany(toktup, outputs) # tokenizer.batch_decode(outputs, skip_special_tokens=True)
42
+
43
+ return result
44
+
45
+
46
+ def loadmodel(mdlname="facebook/m2m100_418M", accelerator=None):
47
+ cl = AutoModelForCausalLM if is_gen_ai(mdlname) else AutoModelForSeq2SeqLM
48
+
49
+ if accelerator is not None:
50
+ model = cl.from_pretrained(mdlname, token=hf_tok, torch_dtype=torch.bfloat16)
51
+ model = accelerator.prepare(model)
52
+ else:
53
+ model = cl.from_pretrained(mdlname, token=hf_tok, torch_dtype=torch.bfloat16, device_map="auto")
54
+
55
+ return model
56
+
57
+
58
+ def encode(model, input_batch):
59
+ model = model.module if hasattr(model, "module") else model
60
+
61
+ if is_nllb(model):
62
+ enc = model.model.encoder
63
+ elif is_madlad(model):
64
+ enc = model.base_model.encoder
65
+ else:
66
+ raise NotImplementedError(f"Model {model} is not supported yet.")
67
+
68
+ inputs_without_labels = { k: input_batch[k] for k in input_batch if k != "labels" }
69
+
70
+ return enc(**inputs_without_labels)
71
+
72
+
73
+ def coupled_encode(coupling_specs, lang_to_bin, input_lang, input_texts, debug=False):
74
+
75
+ mdl_type = get_mdl_type(coupling_specs[0].model)
76
+ conv_input_lang = any_to_mdl_type(mdl_type, input_lang)
77
+
78
+ this = coupling_specs[lang_to_bin[conv_input_lang]]
79
+
80
+ # 0. input text --> input token IDs
81
+ these_inputs, _ = prepare_for_translation(input_texts, (this.tokenizer, this.postokenizer), conv_input_lang, device=this.model.device)
82
+ attention_mask = these_inputs["attention_mask"]
83
+ if debug:
84
+ for iii in range(len(input_texts)):
85
+ toklist = []
86
+ for tok_idx in these_inputs['input_ids'][iii]:
87
+ try:
88
+ tok = this.tokenizer.convert_ids_to_tokens([tok_idx])[0]
89
+ except IndexError:
90
+ tok = this.postokenizer['idx2tok'][str(tok_idx.item())]
91
+ toklist.append(tok)
92
+ print(these_inputs['input_ids'][iii])
93
+ print(toklist)
94
+
95
+ # 1. input token IDs --> encoder vectors
96
+ #embeddings = this.model.model.encoder(**these_inputs)
97
+ return encode(this.model, these_inputs), attention_mask
98
+
99
+
100
+ def postproc_llm_output(raw_outputs, tok):
101
+ eos_id = tok.convert_tokens_to_ids(tok.eos_token)
102
+
103
+ for i, _ in enumerate(raw_outputs):
104
+ repl = None
105
+ for ii, t in enumerate(raw_outputs[i]):
106
+ if t.item() == eos_id:
107
+ repl = eos_id
108
+ if repl is not None:
109
+ raw_outputs[i][ii] = repl
110
+
111
+ return raw_outputs
112
+
113
+
114
+ def llm_generate(coupling_specs, input_language, output_language, input_texts, debug=False):
115
+ mdl_type = get_mdl_type(coupling_specs[0].model)
116
+ conv_input_lang = any_to_mdl_type(mdl_type, input_language)
117
+ conv_output_lang = any_to_mdl_type(mdl_type, output_language)
118
+
119
+ tokenizer = coupling_specs[0].tokenizer
120
+
121
+ prep_texts = [make_gen_text(conv_input_lang, conv_output_lang, input_txt, None) for input_txt in input_texts]
122
+
123
+ tokenized = tokenizeit((tokenizer, None), prep_texts, 1024, is_target=False, is_llm=True)
124
+
125
+ obj = coupling_specs[0].model
126
+ obj = obj.module if hasattr(obj, "module") else obj
127
+
128
+ tokenized['input_ids'] = tokenized['input_ids'].to(obj.device)
129
+ tokenized['attention_mask'] = tokenized['attention_mask'].to(obj.device)
130
+
131
+ raw_outputs = obj.generate(**tokenized, max_length)
132
+
133
+ # 3. output token IDs --> output text
134
+ pre_result = tokenizer.batch_decode(postproc_llm_output(raw_outputs, tokenizer), skip_special_tokens=True)
135
+
136
+ result = [raw_out[len(prep_texts[i]):].split("\n")[0] for i, raw_out in enumerate(pre_result)]
137
+ """
138
+ # for i, raw_out in enumerate(pre_result):
139
+ # print("====")
140
+ # print(i, raw_out)
141
+ # print("%%%%")
142
+ # print(raw_out[len(prep_texts[i])-3:])
143
+ # print("----")
144
+ """
145
+
146
+ return result
147
+
148
+ def coupled_generate(coupling_specs, lang_to_bin, output_lang, encoder_embeddings, att_mask, debug=False):
149
+ mdl_type = get_mdl_type(coupling_specs[0].model)
150
+ conv_output_lang = any_to_mdl_type(mdl_type, output_lang)
151
+
152
+ dec_idx = lang_to_bin[conv_output_lang]
153
+
154
+ tokenizer = coupling_specs[dec_idx].tokenizer
155
+
156
+ # 2. encoder vectors --> output token IDs
157
+ frc_bos = tokenizer.convert_tokens_to_ids(conv_output_lang)
158
+ obj = coupling_specs[dec_idx].model
159
+ obj = obj.module if hasattr(obj, "module") else obj
160
+
161
+ raw_outputs = obj.generate(forced_bos_token_id=frc_bos, encoder_outputs=encoder_embeddings, attention_mask=att_mask)
162
+ if debug:
163
+ for rwout in raw_outputs:
164
+ print(rwout)
165
+ print(tokenizer.convert_ids_to_tokens(rwout))
166
+
167
+ # 3. output token IDs --> output text
168
+ result = finalize_translation(raw_outputs, (tokenizer, coupling_specs[dec_idx].postokenizer))
169
+
170
+ return result
171
+
172
+
173
+ def make_uniq(lang_to_bin):
174
+ result = defaultdict(lambda: 0)
175
+
176
+ for lang in lang_to_bin:
177
+ bin_set = lang_to_bin[lang]
178
+ result[lang] = 0 if 0 in bin_set else list(bin_set)[0]
179
+
180
+ return result
181
+
182
+
183
+ def translate_with_neurotolge(translation_input: str, src_lang: str, tgt_lang: str) -> dict:
184
+ url = "https://api.tartunlp.ai/translation/v2"
185
+
186
+ payload = {
187
+ "text": translation_input,
188
+ "src": any_to_neurotolge(src_lang),
189
+ "tgt": any_to_neurotolge(tgt_lang),
190
+ "domain": "general",
191
+ "application": "benchmarking"
192
+ }
193
+
194
+ error = None
195
+
196
+ for i in range(5):
197
+ try:
198
+ response = requests.post(url, json=payload)
199
+ response.raise_for_status() # Raise an error for bad status codes
200
+ return response.json()['result']
201
+ except requests.exceptions.RequestException as e:
202
+ error = {"error": str(e)}
203
+
204
+ return error
205
+
206
+
207
+ def remove_dia(snt):
208
+ if ">" in snt:
209
+ return re.sub(r'^<[^>]+> ', '', snt)
210
+ else:
211
+ return snt
212
+
213
+
214
+ def neurotolge_in_batches(input_texts, src_lang, tgt_lang):
215
+ neurotolge_langs = {'eng', 'est', 'ger', 'lit', 'lav', 'lvs', 'fin', 'rus', 'ukr', 'kca', 'koi', 'kpv', 'krl', 'lud', 'mdf', 'mhr', 'mns', 'mrj', 'myv', 'olo', 'udm', 'vep', 'liv', 'vro', 'sma', 'sme', 'smn', 'sms', 'smj', 'nor', 'hun'}
216
+
217
+ if src_lang in neurotolge_langs and tgt_lang in neurotolge_langs:
218
+ all_outputs = list()
219
+
220
+ for inp_batch in do_list_in_batches(input_texts, 8):
221
+ inp_batch_no_dia = [remove_dia(s) for s in inp_batch]
222
+ these_outputs = translate_with_neurotolge(inp_batch_no_dia, src_lang, tgt_lang)
223
+ if len(these_outputs) != len(inp_batch_no_dia):
224
+ raise Exception(f"Something went wrong.: {src_lang}/{tgt_lang}/{these_outputs}")
225
+ all_outputs += these_outputs
226
+ log(f"Translated {len(all_outputs)}/{len(input_texts)} sentences")
227
+
228
+ return all_outputs
229
+ else:
230
+ return None
231
+
232
+
233
+ def coupled_translate(coupling_specs, input_texts, input_language, output_language, debug=False):
234
+ lang_to_bin = make_uniq(lang_bin_mapping(coupling_specs))
235
+
236
+ all_outputs = list()
237
+
238
+ for inp_batch in do_list_in_batches(input_texts, 32):
239
+ if is_dec_only_llm(coupling_specs[0].tokenizer):
240
+ these_outputs = llm_generate(coupling_specs, input_language, output_language, input_texts, debug=debug)
241
+ else:
242
+ encoder_embeddings, att_mask = coupled_encode(coupling_specs, lang_to_bin, input_language, inp_batch, debug=debug)
243
+ these_outputs = coupled_generate(coupling_specs, lang_to_bin, output_language, encoder_embeddings, att_mask, debug=debug)
244
+
245
+ all_outputs += these_outputs
246
+
247
+ return all_outputs
248
+
249
+
250
+ def load_and_init_module_config(model_id, accelerator=None):
251
+ config = load_module_config(model_id)
252
+
253
+ coupling_specs = list()
254
+
255
+ main_model = None
256
+
257
+ for i, entry in enumerate(config):
258
+ lang_set = entry["lang_set"]
259
+ model_id = entry["model_id"] if i > 0 else model_id
260
+
261
+ log(f"Loading model and tokenizer from '{model_id}'")
262
+ model = loadmodel(model_id, accelerator)
263
+ tokenizer, postok = load_tokenizer(model_id)
264
+
265
+ if i == 0:
266
+ main_model = model
267
+
268
+ #(langs, model, tokenizer, location):
269
+ coupling_specs += to_cpl_spec(lang_set, model, tokenizer, postok, model_id)
270
+
271
+ return main_model, coupling_specs
272
+
273
+
274
+ def _cmdline_args(inputs):
275
+ # description = ""Translate STDIN text with a translation model""
276
+
277
+ pos_args = ["mdl_id", "from_lang", "to_lang"]
278
+
279
+ #post-process the arguments
280
+ args = CmdlineArgs(description, pos_args, input_args=inputs, kw_arg_dict={"debug": False})
281
+
282
+ log(f"Launched as {args}")
283
+
284
+ return args
285
+
286
+
287
+ def and_i_called_this_function_do_main_too(iv):
288
+ args = _cmdline_args(iv)
289
+
290
+ inputs = [line.strip() for line in sys.stdin]
291
+ # inputs = ["See on ikka tore uudis.", "Ma ikka katsetaks ka täpitähtedega tõlkimist.", "Mis tähed on täpitähed?"]
292
+
293
+ log(f"Inputs: {inputs}")
294
+
295
+ main_model, module_config = load_and_init_module_config(args.mdl_id)
296
+ log("Model loaded, starting to translate")
297
+ outputs = coupled_translate(module_config, inputs, args.from_lang, args.to_lang, debug=args.debug)
298
+
299
+ print("\n".join(outputs))
300
+
301
+ log("Done...")
302
+
303
+
304
+ if __name__ == "__main__":
305
+ input_values = sys.argv[1:] if len(sys.argv) > 1 \
306
+ else ["models/nllb", "et", "en"]
307
+
308
+ and_i_called_this_function_do_main_too(input_values)
309
+ """
kuidastaltsutadalaamat/metrics.py ADDED
@@ -0,0 +1,79 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from data import read_input
4
+ from aux import log
5
+
6
+ import sys
7
+ from collections import defaultdict
8
+ from evaluate import load as load_metric
9
+
10
+ SMUGRI_RES = {
11
+ 'high': set("Estonian,English,Russian,Finnish,Hungarian,Latvian,German,Swedish,Norwegian,French".split(",")),
12
+ 'mid': set("Komi,Komi-Zyrian,Northern Sami,Meadow Mari".split(",")),
13
+ 'low': set("Udmurt,Proper Karelian,Southern Sami,Livvi,Veps,Moksha,Erzya,Lule Sami,Võro,Hill Mari,"
14
+ "Komi-Permyak,Inari Sami".split(",")),
15
+ 'xlow': set("Ludian,Livonian,Izhorian,Votic,Shur Khanty,Skolt Sami,Meänkieli,"
16
+ "Sred Khanty,Surgut Khanty,Priur Khanty,Vakh Khanty,Unk Khanty,"
17
+ "Pite Sami,Mansi,Kazym Khanty,Kven,Ume Sami,Kildin Sami".split(","))
18
+ }
19
+
20
+
21
+ def _gen_lang(lang):
22
+ return lang.split(",")[0]
23
+
24
+
25
+ def _hi_or_lo_lang(lang):
26
+ gen_lang = _gen_lang(lang)
27
+
28
+ for k, v in SMUGRI_RES.items():
29
+ if gen_lang in v:
30
+ return k
31
+
32
+ log(f"Unrecognized language: {lang} / {gen_lang}")
33
+ return '?'
34
+
35
+
36
+ def _collect_lp_pairs(json_inputs, str_outputs):
37
+ sets_by_lp = defaultdict(list)
38
+
39
+ for i, o in zip(json_inputs, str_outputs):
40
+ ref = i["tgt_segm"]
41
+ hyp = o
42
+ det_lp = 'detailed: ' + i["src_lang"] + " -> " + i["tgt_lang"]
43
+ gen_lp = 'general: ' + _gen_lang(i["src_lang"]) + " -> " + _gen_lang(i["tgt_lang"])
44
+ hilo_lp = 'classes: ' + _hi_or_lo_lang(i["src_lang"]) + " -> " + _hi_or_lo_lang(i["tgt_lang"])
45
+
46
+ sets_by_lp[det_lp].append((hyp, ref))
47
+ sets_by_lp[gen_lp].append((hyp, ref))
48
+ sets_by_lp[hilo_lp].append((hyp, ref))
49
+
50
+ return sets_by_lp
51
+
52
+
53
+ def compute_metrics(json_inputs, str_outputs):
54
+ sets_by_lp = _collect_lp_pairs(json_inputs, str_outputs)
55
+
56
+ metric = load_metric("chrf")
57
+
58
+ result = []
59
+
60
+ for lp in sets_by_lp:
61
+ preds, outputs = zip(*sets_by_lp[lp])
62
+ metric_value = metric.compute(predictions=preds, references=outputs)
63
+
64
+ result.append((lp, metric_value, len(preds)))
65
+
66
+ return result
67
+
68
+
69
+ def avoid_global_scope():
70
+ json_inputs = read_input(sys.argv[1], "json")
71
+ str_outputs = read_input(sys.argv[2], "json")
72
+
73
+ lp_metric_dict = compute_metrics(json_inputs, str_outputs)
74
+
75
+ for lp, metric, size in lp_metric_dict:
76
+ print(f"{lp}: {metric['score']:.2f} ({size})")
77
+
78
+ if __name__ == "__main__":
79
+ avoid_global_scope()
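
A minimal usage sketch of compute_metrics with toy data, mainly to show the expected input shapes: json_inputs carries src_lang, tgt_lang and the reference tgt_segm, while str_outputs is the parallel list of system hypotheses (the import path is an assumption):

from kuidastaltsutadalaamat.metrics import compute_metrics  # import path assumed

json_inputs = [
    {"src_lang": "Estonian", "tgt_lang": "Võro", "tgt_segm": "reference one"},
    {"src_lang": "Estonian", "tgt_lang": "Võro", "tgt_segm": "reference two"},
]
str_outputs = ["hypothesis one", "hypothesis two"]

# Prints chrF per detailed, general and resource-class language pair, as in avoid_global_scope().
for lp, metric_value, size in compute_metrics(json_inputs, str_outputs):
    print(f"{lp}: {metric_value['score']:.2f} ({size})")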
kuidastaltsutadalaamat/promptops.py ADDED
@@ -0,0 +1,70 @@
1
+
2
+ # first, keyword identifiers for selecting prompt templates in scripts:
3
+
4
+ PF_RAW = "raw"
5
+ PF_RAWLINES = "rawlines"
6
+ PF_SMUGRI_MT = "smugri_mt"
7
+ PF_SMUGRI_LID = "smugri_lid"
8
+ PF_ALPACA = "alpaca"
9
+
10
+ # now the prompt templates themselves, SMUGRI LID / MT template:
11
+
12
+ SMUGRI_INF_PROMPT_LID = "<|reserved_special_token_12|>{src_segm}<|reserved_special_token_13|>"
13
+
14
+ _SMUGRI_INF_PROMPT_TMPMID = "<|reserved_special_token_14|>{task} to {tgt_lang}<|reserved_special_token_15|>"
15
+ SMUGRI_INF_PROMPT_MT = SMUGRI_INF_PROMPT_LID + "{src_lang}" + _SMUGRI_INF_PROMPT_TMPMID
16
+
17
+ _SMUGRI_TRAIN_PROMPT_PREF = SMUGRI_INF_PROMPT_LID + "{src_lang}"
18
+ _SMUGRI_TRAIN_PROMPT_MID = _SMUGRI_INF_PROMPT_TMPMID + "{tgt_segm}"
19
+ _SMUGRI_TRAIN_PROMPT_SUF = "<|reserved_special_token_16|><|end_of_text|>"
20
+
21
+ SMUGRI_PROMPT_TRAIN_PARA = _SMUGRI_TRAIN_PROMPT_PREF + _SMUGRI_TRAIN_PROMPT_MID + _SMUGRI_TRAIN_PROMPT_SUF
22
+ SMUGRI_PROMPT_TRAIN_MONO = _SMUGRI_TRAIN_PROMPT_PREF + _SMUGRI_TRAIN_PROMPT_SUF
23
+
24
+ # Alpaca instructions prompt template:
25
+
26
+ ALPACA_PROMPT_INF = ("Below is an instruction that describes a task, paired with an input that provides further context. "
27
+ "Write a response that appropriately completes the request.\n\n"
28
+ "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n")
29
+
30
+ ALPACA_PROMPT_TRAIN = (ALPACA_PROMPT_INF + "{output}")
31
+
32
+
33
+ def prep_prompt(data, prompt_format, inference=False):
34
+ if prompt_format in {PF_RAW, PF_RAWLINES}:
35
+ # data is a string, return it
36
+ return data
37
+
38
+ elif prompt_format in {PF_SMUGRI_MT, PF_SMUGRI_LID}:
39
+ # data has src_segm, src_lang, tgt_lang, etc
40
+ return _prep_ljmf_entry(data, prompt_format, inference)
41
+
42
+ elif prompt_format == PF_ALPACA:
43
+ # data has instruction and input in it
44
+ return _prep_alpaca_entry(data, inference)
45
+
46
+ else:
47
+ raise NotImplementedError(f"Prompt format {prompt_format} is not implemented.")
48
+
49
+
50
+ def _prep_alpaca_entry(entry, inference=False):
51
+ fmt = ALPACA_PROMPT_INF if inference else ALPACA_PROMPT_TRAIN
52
+ prompt = fmt.format(**entry)
53
+ return prompt
54
+
55
+
56
+ def _prep_ljmf_entry(entry, fmt, inference=False):
57
+ if inference:
58
+ if fmt == PF_SMUGRI_MT:
59
+ prompt = SMUGRI_INF_PROMPT_MT.format(**entry)
60
+ elif fmt == PF_SMUGRI_LID:
61
+ prompt = SMUGRI_INF_PROMPT_LID.format(**entry)
62
+ else:
63
+ raise NotImplementedError(f"Prompt format {fmt} is not implemented.")
64
+ else:
65
+ if entry['task'] in {'translate', 'approx-translate'} and entry['tgt_segm'] and entry['tgt_lang']:
66
+ prompt = SMUGRI_PROMPT_TRAIN_PARA.format(**entry)
67
+ else:
68
+ prompt = SMUGRI_PROMPT_TRAIN_MONO.format(**entry)
69
+
70
+ return prompt
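
A short sketch of how the templates above compose, assuming an entry with the fields referenced by the format strings (values are made up; the import path is an assumption):

from kuidastaltsutadalaamat.promptops import prep_prompt, PF_SMUGRI_MT  # import path assumed

entry = {
    "task": "translate",
    "src_lang": "Estonian",
    "tgt_lang": "Livonian",
    "src_segm": "source sentence here",
    "tgt_segm": "target sentence here",
}

# Training: LID block + source language + task/target instruction + target segment + end-of-text suffix.
train_prompt = prep_prompt(entry, PF_SMUGRI_MT, inference=False)

# Inference: the prompt stops right after "... to {tgt_lang}", leaving the target segment to be generated.
inf_prompt = prep_prompt(entry, PF_SMUGRI_MT, inference=True)

print(train_prompt)
print(inf_prompt)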
kuidastaltsutadalaamat/trainllm.py ADDED
@@ -0,0 +1,252 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from .promptops import PF_SMUGRI_MT
4
+ from .aux import log, CmdlineArgs
5
+ from .data import load_training_data
6
+
7
+ import json
8
+ import os, socket, torch
9
+
10
+ from datetime import datetime
11
+
12
+ from accelerate import Accelerator
13
+ from transformers import (
14
+ AutoTokenizer,
15
+ AutoModelForCausalLM,
16
+ TrainingArguments,
17
+ Trainer,
18
+ DataCollatorForLanguageModeling,
19
+ logging,
20
+ TrainerCallback
21
+ )
22
+
23
+ """
24
+ 1/3 This simply reads in command-line arguments
25
+ """
26
+
27
+ def _cmdline_args():
28
+ description = """Train or tune decoder models"""
29
+
30
+ result = CmdlineArgs(description,
31
+ pos_arg_list=["mdl_id", "save_location", "train_file"],
32
+ pos_arg_types=[str, str, str],
33
+ kw_arg_dict={ "continue_training": False, "save_steps": 100, "lr": 1.5e-5,
34
+ "batch_size": 1024, "nr_sents_per_gpu": 4, "log_steps": 1, "epochs": 4,
35
+ "max_length": 2000, "prompt_format": PF_SMUGRI_MT,
36
+ "deepspeed": "none"})
37
+
38
+ # if the directory args.save_location already exists, raise an exception:
39
+ if not result.continue_training and os.path.exists(result.save_location):
40
+ raise Exception(f"Save location '{result.save_location}' already exists, don't want to overwrite.")
41
+
42
+ if result.nr_sents_per_gpu == 0:
43
+ result.nr_sents_per_gpu = result.batch_size
44
+
45
+ if result.deepspeed == "none":
46
+ result.deepspeed = None
47
+
48
+ return result
49
+
50
+ """
51
+ 2/3 This here is used in training in order to report timing and predictions
52
+ """
53
+
54
+ class StepTimerCallback(TrainerCallback):
55
+ def __init__(self):
56
+ self._step_start = None
57
+ self.lengths = []
58
+ self.abs_start = datetime.now()
59
+
60
+ self.actual_first_step = None
61
+
62
+ self.zero = self.abs_start - self.abs_start
63
+
64
+ def on_step_begin(self, args, state, control, **kwargs):
65
+ # called right before each training step
66
+ self._step_start = datetime.now()
67
+
68
+ def on_step_end(self, args, state, control, **kwargs):
69
+ if self.actual_first_step is None:
70
+ self.actual_first_step = state.global_step - 1
71
+
72
+ # called right after each training step
73
+ now = datetime.now()
74
+ elapsed = now - self._step_start
75
+ tot_elapsed = now - self.abs_start
76
+ self.lengths.append(elapsed)
77
+
78
+ avg = sum(self.lengths, start=self.zero) / len(self.lengths)
79
+
80
+ remaining = state.max_steps - state.global_step
81
+ prediction = (tot_elapsed/(state.global_step - self.actual_first_step)) * remaining
82
+
83
+ # you can use logging.get_logger(...) instead of print
84
+ print(f"[step {state.global_step}/{state.max_steps}] took {elapsed}, avg {avg}; approx {prediction} remaining")
85
+
86
+ """
87
+ 3/3 Finally, the filling of TrainingArguments and the launching of Trainer:
88
+ """
89
+
90
+ def get_training_args(cmdline_args, acc):
91
+ world_size = acc.num_processes
92
+
93
+ assert cmdline_args.batch_size % (cmdline_args.nr_sents_per_gpu * world_size) == 0, \
94
+ "Batch size must be divisible by the number of GPUs and nr of sents per GPU"
95
+
96
+ accum_steps = cmdline_args.batch_size // (cmdline_args.nr_sents_per_gpu * world_size)
97
+
98
+ log(f"Nr of processes (GPUs): {world_size}, per-device batch: {cmdline_args.nr_sents_per_gpu}, accum. steps: {accum_steps}")
99
+
100
+ if cmdline_args.deepspeed is not None:
101
+ with open(cmdline_args.deepspeed, "r") as f:
102
+ dpspd = json.load(f)
103
+
104
+ #correct the dictionary with current values, so that we wouldn't need to update the JSON every time
105
+ dpspd['train_batch_size'] = cmdline_args.batch_size
106
+ dpspd['train_micro_batch_size_per_gpu'] = cmdline_args.nr_sents_per_gpu
107
+ dpspd['gradient_accumulation_steps'] = accum_steps
108
+
109
+ log(f"Using deepspeed with config {dpspd}")
110
+ else:
111
+ dpspd = None
112
+
113
+ tr_args = TrainingArguments(
114
+ output_dir=cmdline_args.save_location,
115
+ per_device_train_batch_size=cmdline_args.nr_sents_per_gpu,
116
+ gradient_accumulation_steps=accum_steps,
117
+ num_train_epochs=cmdline_args.epochs,
118
+ save_steps=cmdline_args.save_steps,
119
+ save_total_limit=10,
120
+ logging_steps=cmdline_args.log_steps,
121
+ deepspeed=dpspd,
122
+ learning_rate=cmdline_args.lr,
123
+ save_strategy="epoch",
124
+ disable_tqdm=True,
125
+ report_to="none",
126
+ # Optional but often helpful on LUMI/ROCm if you enable it in your args:
127
+ bf16=True,
128
+ ddp_find_unused_parameters=False,
129
+ #dataloader_num_workers=1,
130
+ #group_by_length=True,
131
+ log_level="debug",
132
+ #gradient_checkpointing=True,
133
+ #dataloader_persistent_workers=True
134
+ )
135
+
136
+ return tr_args
137
+
138
+
139
+ def load_model(mdl_id, device, accelerator=None, attention="flash_attention_2"):
140
+ log(f"Load model", accelerator=accelerator)
141
+ model = AutoModelForCausalLM.from_pretrained(mdl_id,
142
+ low_cpu_mem_usage=False,
143
+ torch_dtype=torch.bfloat16,
144
+ attn_implementation=attention)
145
+
146
+ model.config.use_cache = False
147
+ model = model.to(device)
148
+ log(f"Model loaded on device: {model.device}.", accelerator=accelerator)
149
+
150
+ return model
151
+
152
+
153
+ def load_tokenizer(mdl_id, accelerator=None):
154
+ log(f"Load tokenizer", accelerator=accelerator)
155
+ tokenizer = AutoTokenizer.from_pretrained(mdl_id)
156
+
157
+ # LLaMA 3.x: no pad token by default
158
+ if tokenizer.pad_token is None:
159
+ tokenizer.pad_token = "<|reserved_special_token_100|>"
160
+
161
+ return tokenizer
162
+
163
+
164
+ def simple_train():
165
+ cmd_args = _cmdline_args()
166
+ acc = Accelerator()
167
+ device = acc.device # it seems that the accelerator loses/changes this info later
168
+
169
+ training_args = get_training_args(cmd_args, acc)
170
+
171
+ tokenizer = load_tokenizer(cmd_args.mdl_id, acc)
172
+ model = load_model(cmd_args.mdl_id, device, acc)
173
+
174
+ if getattr(model.config, "pad_token_id", None) is None:
175
+ model.config.pad_token_id = tokenizer.pad_token_id
176
+
177
+ log(f"Load data", accelerator=acc)
178
+ tokenized_train_data = load_training_data(cmd_args.train_file, tokenizer, cmd_args)
179
+
180
+ data_collator = DataCollatorForLanguageModeling(
181
+ tokenizer=tokenizer,
182
+ mlm=False,
183
+ pad_to_multiple_of=8, # GPT says this helps performance
184
+ )
185
+
186
+ log(f"Preparing to train", accelerator=acc)
187
+
188
+ clbks = [StepTimerCallback] if acc.is_main_process else []
189
+
190
+ trainer = Trainer(
191
+ model=model,
192
+ args=training_args,
193
+ train_dataset=tokenized_train_data,
194
+ tokenizer=tokenizer,
195
+ data_collator=data_collator,
196
+ callbacks=clbks,
197
+ )
198
+
199
+ logging.set_verbosity_debug()
200
+
201
+ log(f"Starting training", accelerator=acc)
202
+ trainer.train(resume_from_checkpoint=cmd_args.continue_training)
203
+
204
+ log(f"Done, saving model", accelerator=acc)
205
+ trainer.save_model()
206
+
207
+
208
+ def env_stuff():
209
+ os.environ.setdefault("LOCAL_RANK", os.environ.get("SLURM_LOCALID", "---"))
210
+ os.environ.setdefault("RANK", os.environ.get("SLURM_PROCID", "0"))
211
+ os.environ.setdefault("WORLD_SIZE", os.environ.get("SLURM_NTASKS", "1"))
212
+ os.environ.setdefault("MASTER_ADDR", os.environ.get("SLURM_LAUNCH_NODE_IPADDR", "127.0.0.1"))
213
+ os.environ.setdefault("MASTER_PORT", "29500") # pick an open port
214
+
215
+ # Optional: make sure each process selects its own GPU
216
+ torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
217
+
218
+ try:
219
+ log(
220
+ f"host={socket.gethostname()} "
221
+ f"RANK={os.environ['RANK']}/{os.environ['WORLD_SIZE']} "
222
+ f"LOCAL_RANK={os.environ['LOCAL_RANK']} "
223
+ f"HIP_VISIBLE_DEVICES={os.environ.get('HIP_VISIBLE_DEVICES')} "
224
+ f"ROCR_VISIBLE_DEVICES={os.environ.get('ROCR_VISIBLE_DEVICES')} "
225
+ f"cuda_count={torch.cuda.device_count()} curr_dev={torch.cuda.current_device()}"
226
+ )
227
+ except AssertionError:
228
+ log(
229
+ f"host={socket.gethostname()} "
230
+ f"RANK={os.environ['RANK']}/{os.environ['WORLD_SIZE']} "
231
+ f"LOCAL_RANK={os.environ['LOCAL_RANK']} "
232
+ f"HIP_VISIBLE_DEVICES={os.environ.get('HIP_VISIBLE_DEVICES')} "
233
+ f"ROCR_VISIBLE_DEVICES={os.environ.get('ROCR_VISIBLE_DEVICES')} "
234
+ f"no cuda"
235
+ )
236
+
237
+ """
238
+ This replaces the trainer, in order to
239
+ print out the final batch when training,
240
+ and commit harakiri. So only for temporary
241
+ debugging-related usage
242
+ """
243
+ class LoggingKillingTrainer(Trainer):
244
+ def compute_loss(self, model, inputs, **kwargs):
245
+ log(f"Here is the batch for training: {inputs}")
246
+ raise NotImplementedError
247
+ #return super().compute_loss(model, inputs, **kwargs)
248
+
249
+ if __name__ == "__main__":
250
+ env_stuff()
251
+
252
+ simple_train()
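
To make the batch arithmetic in get_training_args concrete, a small worked example (the process count is illustrative): with the default batch_size=1024 and nr_sents_per_gpu=4 on, say, 8 processes, the divisibility assertion holds and gradient accumulation is 32 steps, so each optimizer update still sees 4 * 8 * 32 = 1024 sentences.

# Illustrative check of the effective-batch arithmetic used in get_training_args.
batch_size = 1024        # sentences per optimizer update (cmdline default)
nr_sents_per_gpu = 4     # per-device micro-batch (cmdline default)
world_size = 8           # assumed number of GPU processes

assert batch_size % (nr_sents_per_gpu * world_size) == 0
accum_steps = batch_size // (nr_sents_per_gpu * world_size)

assert nr_sents_per_gpu * world_size * accum_steps == batch_size
print(accum_steps)  # -> 32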