Glebs Vinarskis committed on
Commit 26b1bda · 1 Parent(s): f5dc74f

Initial commit including model and configuration

Files changed (7)
  1. __init__.py +0 -0
  2. config.json +17 -0
  3. configuration_stacked.py +32 -0
  4. lang_ident.py +40 -0
  5. modeling_stacked.py +159 -0
  6. push_to_hf.py +181 -0
  7. test.py +16 -0
__init__.py ADDED
File without changes
config.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "auto_map": {
+     "AutoConfig": "configuration_stacked.ImpressoConfig"
+   },
+   "custom_pipelines": {
+     "lang-ident": {
+       "impl": "lang_ident.LangIdentPipeline",
+       "pt": [
+         "ExtendedMultitaskModelForTokenClassification"
+       ],
+       "tf": []
+     }
+   },
+   "filename": "LID-40-3-2000000-1-4.bin",
+   "model_type": "floret",
+   "transformers_version": "4.45.2"
+ }
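Usage note: the `auto_map` and `custom_pipelines` entries above are what let `transformers` resolve the custom classes from this repo when it is loaded with `trust_remote_code=True`. A minimal sketch of the load path this config enables, mirroring test.py below (the repo id is the one used there):

    from transformers import pipeline

    # config.json's auto_map resolves ImpressoConfig from configuration_stacked.py,
    # and custom_pipelines maps the "lang-ident" task to lang_ident.LangIdentPipeline
    lang_pipeline = pipeline(
        "lang-ident", model="emanuelaboros/lang-detect",
        trust_remote_code=True, device="cpu"
    )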
configuration_stacked.py ADDED
@@ -0,0 +1,32 @@
+ from transformers import PretrainedConfig
+ import os
+
+
+ class ImpressoConfig(PretrainedConfig):
+     model_type = "floret"
+
+     def __init__(self, filename="LID-40-3-2000000-1-4.bin", **kwargs):
+         super().__init__(**kwargs)
+         self.filename = filename
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+         # Bypass JSON loading and create the config directly
+         print(f"Loading ImpressoConfig from {pretrained_model_name_or_path}")
+         print(os.getcwd())
+         config = cls(filename="LID-40-3-2000000-1-4.bin", **kwargs)
+         return config
+
+
+ # Register the configuration with the transformers library
+ ImpressoConfig.register_for_auto_class()
+
+ # Register the custom pipeline
+ # PIPELINE_REGISTRY.register_pipeline(
+ #     task="lang-ident",
+ #     pipeline_class=LangIdentPipeline,
+ #     model=AutoModelForSequenceClassification,
+ #     tokenizer=AutoTokenizer,
+ # )
+ #
+ # print("Custom pipeline 'lang-ident' registered successfully.")
lang_ident.py ADDED
@@ -0,0 +1,40 @@
+ from transformers import Pipeline
+
+
+ class LangIdentPipeline(Pipeline):
+
+     def _sanitize_parameters(self, **kwargs):
+         preprocess_kwargs = {}
+         if "text" in kwargs:
+             preprocess_kwargs["text"] = kwargs["text"]
+         return preprocess_kwargs, {}, {}
+
+     def preprocess(self, text, **kwargs):
+         print("this is preprocessing:")
+         print(text)
+         return text
+
+     def _forward(self, text):
+         # Extract label and confidence
+         predictions, probabilities = self.model.predict([text], k=1)
+
+         label = predictions[0][0].replace("__label__", "")  # Remove __label__ prefix
+         confidence = float(
+             probabilities[0][0]
+         )  # Convert to float for JSON serialization
+
+         # Format as a JSON-compatible dictionary
+         model_output = {"label": label, "confidence": round(confidence * 100, 2)}
+
+         print("Formatted Model Output:", model_output)
+         return model_output
+
+     def postprocess(self, outputs, **kwargs):
+         return outputs
+
+
+ # PIPELINE_REGISTRY.register_pipeline(
+ #     task="language-detection",
+ #     pipeline_class=Pipeline_One,
+ #     default={"model": None},
+ # )
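Because `_forward` returns a plain dict instead of tensors, the pipeline's output is already JSON-serializable. A sketch of the expected output shape, using a pipeline constructed as in test.py (the label and confidence values are illustrative, not actual model output):

    result = lang_pipeline("Le chat dort sur le canapé.")
    print(result)  # e.g. {"label": "fr", "confidence": 99.12} (illustrative values)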
modeling_stacked.py ADDED
@@ -0,0 +1,159 @@
+ from transformers import PreTrainedModel, AutoModel, AutoConfig, PretrainedConfig
+ import floret, torch
+ import os, shutil
+ from configuration_stacked import ImpressoConfig
+ from transformers.modeling_utils import (
+     get_parameter_device as original_get_parameter_device,
+ )
+
+
+ import torch
+
+ # Import Hugging Face dependencies
+ import transformers.modeling_utils
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.modeling_utils import (
+     get_parameter_device as original_get_parameter_device,
+ )
+
+
+ # Custom get_parameter_device
+ def custom_get_parameter_device(module):
+     """
+     Custom get_parameter_device() to handle floret models.
+     Returns 'cpu' for FloretModelWrapper, otherwise uses the original implementation.
+     """
+     # Check if the model is an instance of FloretModelWrapper
+     if isinstance(module, FloretModelWrapper):
+         print(
+             "Custom get_parameter_device(): Detected FloretModelWrapper. Returning 'cpu'."
+         )
+         return torch.device("cpu")
+
+     # Otherwise, fall back to Hugging Face's original implementation
+     return original_get_parameter_device(module)
+
+
+ # Custom device property
+ @property
+ def custom_device(self) -> torch.device:
+     """
+     Custom device() method to handle floret models.
+     Always returns torch.device('cpu') for FloretModelWrapper.
+     """
+     # Check if the model is an instance of FloretModelWrapper
+     if isinstance(self, FloretModelWrapper):
+         print(
+             "Custom device(): Detected FloretModelWrapper. Returning torch.device('cpu')."
+         )
+         return torch.device("cpu")
+
+     # Otherwise, fall back to Hugging Face's original implementation
+     return torch.device("cpu")  # original_device.__get__(self, type(self))
+
+
+ # Monkey-patch get_parameter_device and the device property
+ transformers.modeling_utils.get_parameter_device = custom_get_parameter_device
+ PreTrainedModel.device = custom_device
+
+ print("Monkey-patch applied: get_parameter_device and device property")
+
+ # logger = logging.getLogger(__name__)
+
+ original_device = PreTrainedModel.device
+
+
+ def get_info(label_map):
+     num_token_labels_dict = {task: len(labels) for task, labels in label_map.items()}
+     return num_token_labels_dict
+
+
+ class FloretModelWrapper:
+     """
+     Wrapper for the floret model to make it compatible with the Hugging Face pipeline.
+     Mocks the .device attribute and passes predict() through unchanged.
+     """
+
+     def __init__(self, floret_model):
+         self.floret_model = floret_model
+
+         # Mock the .device attribute to make Hugging Face happy
+         self.device = torch.device("cpu")  # floret is always on CPU
+
+     def predict(self, text, k=1):
+         """
+         Pass-through for floret's predict() method.
+         """
+         return self.floret_model.predict(text, k=k)
+
+
+ class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
+
+     config_class = ImpressoConfig
+
+     def __init__(self, config: PretrainedConfig, *inputs, **kwargs):
+         super().__init__(config)
+         self.config = config
+         print("Does it even pass through here?")
+         print(
+             f"The config in ExtendedMultitaskModelForTokenClassification is: {self.config}"
+         )
+         # self.model = floret.load_model(self.config.filename)
+
+     def predict(self, text, k=1):
+         predictions = self.model.predict(text, k)
+         return predictions
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+         print("Calling from_pretrained...")
+
+         # Initialize the model with its config
+         model = cls(ImpressoConfig())
+
+         # Load the model using floret
+         print(f"---Loading model from: {model.config.filename}")
+         floret_model = floret.load_model(model.config.filename)
+
+         # Wrap the model to fake the .device attribute
+         model.model = FloretModelWrapper(floret_model)
+
+         print(model.model, "device:", model.model.device)
+
+         print(f"Model loaded and wrapped from: {model.config.filename}")
+
+         return model
+
+     def save_pretrained(self, save_directory, *args, **kwargs):
+         # Ignore Hugging Face-specific arguments
+         max_shard_size = kwargs.pop("max_shard_size", None)
+         safe_serialization = kwargs.pop("safe_serialization", False)
+
+         # Ensure the directory exists
+         os.makedirs(save_directory, exist_ok=True)
+
+         # Save the model file
+         model_file = os.path.join(save_directory, "LID-40-3-2000000-1-4.bin")
+         shutil.copy(self.config.filename, model_file)
+
+         # Save the config file
+         config_file = os.path.join(save_directory, "config.json")
+         self.config.save_pretrained(save_directory)
+
+         print(f"Model saved to: {save_directory}")
+
+     def get_parameter_device(module):
+         """
+         Custom get_parameter_device() to handle floret models.
+         Returns 'cpu' for floret models, and falls back to the original method otherwise.
+         """
+         # Check if the model is an instance of FloretModelWrapper
+         if isinstance(module, FloretModelWrapper):
+             print(
+                 "Custom get_parameter_device(): Detected FloretModelWrapper. Returning 'cpu'."
+             )
+             return "cpu"
+
+         # Otherwise, fall back to Hugging Face's original implementation
+         return original_get_parameter_device(module)
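The monkey-patching above exists because `Pipeline` asks the model for `.device` (and `modeling_utils` for `get_parameter_device()`), while a floret model has no torch parameters to inspect. A minimal sketch of the load path this enables, assuming LID-40-3-2000000-1-4.bin sits in the working directory:

    # from_pretrained ignores the given path and loads the floret binary named in the config
    model = ExtendedMultitaskModelForTokenClassification.from_pretrained(".")
    print(model.model.device)  # torch.device("cpu"), mocked by FloretModelWrapper
    predictions = model.predict(["Guten Morgen"], 1)  # delegated to floret's predict()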
push_to_hf.py ADDED
@@ -0,0 +1,181 @@
+ import os
+ import shutil
+ import argparse
+ from transformers import (
+     AutoTokenizer,
+     AutoConfig,
+     AutoModelForSequenceClassification,
+ )
+ from huggingface_hub import HfApi, Repository
+ from transformers.pipelines import PIPELINE_REGISTRY
+
+ # import json
+ from configuration_stacked import ImpressoConfig
+ from modeling_stacked import ExtendedMultitaskModelForTokenClassification
+ import subprocess
+ from lang_ident import LangIdentPipeline
+
+
+ def get_latest_checkpoint(checkpoint_dir):
+     checkpoints = [
+         d
+         for d in os.listdir(checkpoint_dir)
+         if os.path.isdir(os.path.join(checkpoint_dir, d))
+         and d.startswith("checkpoint-")
+     ]
+     checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[-1]), reverse=True)
+     return os.path.join(checkpoint_dir, checkpoints[0])
+
+
+ def get_info(label_map):
+     num_token_labels_dict = {task: len(labels) for task, labels in label_map.items()}
+     return num_token_labels_dict
+
+
+ def push_model_to_hub(checkpoint_dir, repo_name):
+     # checkpoint_path = get_latest_checkpoint(checkpoint_dir)
+     checkpoint_path = checkpoint_dir
+     config = ImpressoConfig.from_pretrained(checkpoint_path)
+     print(config)
+
+     config.pretrained_config = ImpressoConfig.from_pretrained(config.filename)
+     config.save_pretrained("floret")
+     config = ImpressoConfig.from_pretrained("floret")
+     PIPELINE_REGISTRY.register_pipeline(
+         "lang-ident",
+         pipeline_class=LangIdentPipeline,
+         pt_model=ExtendedMultitaskModelForTokenClassification,
+     )
+
+     # PIPELINE_REGISTRY.register_pipeline(
+     #     "pair-classification",
+     #     pipeline_class=PairClassificationPipeline,
+     #     pt_model=AutoModelForSequenceClassification,
+     #     tf_model=TFAutoModelForSequenceClassification,
+     # )
+
+     config.custom_pipelines = {
+         "lang-ident": {
+             "impl": "lang_ident.LangIdentPipeline",
+             "pt": ["AutoModelForSequenceClassification"],
+             "tf": [],
+         }
+     }
+     model = ExtendedMultitaskModelForTokenClassification.from_pretrained(
+         checkpoint_path, config=config
+     )
+
+     local_repo_path = "lang-detect"
+     repo_url = HfApi().create_repo(repo_id=repo_name, exist_ok=True)
+     repo = Repository(local_dir=local_repo_path, clone_from=repo_url)
+
+     try:
+         # Try to pull the latest changes from the remote repository
+         subprocess.run(["git", "pull"], check=True, cwd=local_repo_path)
+     except subprocess.CalledProcessError:
+         # If a fast-forward is not possible, reset the local branch to match the remote
+         subprocess.run(
+             ["git", "reset", "--hard", "origin/main"],
+             check=True,
+             cwd=local_repo_path,
+         )
+
+     # Copy all Python and JSON files to the local repository directory
+     current_dir = os.path.dirname(os.path.abspath(__file__))
+     for filename in os.listdir(current_dir):
+         if filename.endswith(".py") or filename.endswith(".json"):
+             shutil.copy(
+                 os.path.join(current_dir, filename),
+                 os.path.join(local_repo_path, filename),
+             )
+
+     ImpressoConfig.register_for_auto_class()
+
+     AutoConfig.register("floret", ImpressoConfig)
+     AutoModelForSequenceClassification.register(
+         ImpressoConfig, ExtendedMultitaskModelForTokenClassification
+     )
+     ExtendedMultitaskModelForTokenClassification.register_for_auto_class(
+         "AutoModelForSequenceClassification"
+     )
+     # model.save_pretrained(local_repo_path)
+
+     from transformers import AutoModelForTokenClassification, AutoTokenizer
+     from transformers import pipeline
+
+     # Define the model name used for language identification (setup adapted from the
+     # Impresso NER model at "https://huggingface.co/impresso-project/ner-stacked-bert-multilingual")
+     MODEL_NAME = "Maslionok/lang-detect"
+
+     # Add, commit, and push the changes to the repository
+     subprocess.run(["git", "add", "."], check=True, cwd=local_repo_path)
+     subprocess.run(
+         ["git", "commit", "-m", "Initial commit including model and configuration"],
+         check=True,
+         cwd=local_repo_path,
+     )
+     subprocess.run(["git", "push"], check=True, cwd=local_repo_path)
+
+     # Push the model to the hub (this includes the README template)
+     model.push_to_hub(repo_name)
+
+     lang_pipeline = pipeline(
+         "lang-ident", model=MODEL_NAME, trust_remote_code=True, device="cpu"
+     )
+     lang_pipeline.push_to_hub(MODEL_NAME)
+     sentence = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."
+
+     print(lang_pipeline(sentence))
+     # lang_pipeline.push_to_hub(MODEL_NAME)
+     print(f"Model and repo pushed to: {repo_url}")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(
+         description="Push the language-identification model to the Hugging Face Hub"
+     )
+     parser.add_argument(
+         "--model_type",
+         type=str,
+         required=True,
+         help="Type of the model (e.g., langident)",
+     )
+     parser.add_argument(
+         "--language",
+         type=str,
+         required=True,
+         help="Language of the model (e.g., multilingual)",
+     )
+     parser.add_argument(
+         "--checkpoint_dir",
+         type=str,
+         required=True,
+         default="LID-40-3-2000000-1-4.bin",
+         help="Directory containing checkpoint folders",
+     )
+     args = parser.parse_args()
+     repo_name = "Maslionok/lang-detect"
+     push_model_to_hub(args.checkpoint_dir, repo_name)
+
+ # PIPELINE_REGISTRY.register_pipeline(
+ #     "generic-ner",
+ #     pipeline_class=MultitaskTokenClassificationPipeline,
+ #     pt_model=ExtendedMultitaskModelForTokenClassification,
+ # )
+ # model.config.custom_pipelines = {
+ #     "generic-ner": {
+ #         "impl": "generic_ner.MultitaskTokenClassificationPipeline",
+ #         "pt": ["ExtendedMultitaskModelForTokenClassification"],
+ #         "tf": [],
+ #     }
+ # }
+ # classifier = pipeline(
+ #     "generic-ner", model=model, tokenizer=tokenizer, label_map=label_map
+ # )
+ # from pprint import pprint
+ #
+ # pprint(
+ #     classifier(
+ #         "1. Le public est averti que Charlotte née Bourgoin, femme-de Joseph Digiez, et Maurice Bourgoin, enfant mineur représenté par le sieur Jaques Charles Gicot son curateur, ont été admis par arrêt du Conseil d'Etat du 5 décembre 1797, à solliciter une renonciation générale et absolue aux biens et aux dettes présentes et futures de Jean-Baptiste Bourgoin leur père."
+ #     )
+ # )
+ # repo.push_to_hub(commit_message="Initial commit of the trained NER model with code")
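For reference, an invocation matching the argparse definition above (note that --checkpoint_dir is marked required, so its declared default is never used, and --model_type/--language are parsed but not otherwise consumed by the script):

    python push_to_hf.py \
        --model_type langident \
        --language multilingual \
        --checkpoint_dir .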
test.py ADDED
@@ -0,0 +1,16 @@
+ # Import the necessary modules from the Transformers library
+ from transformers import AutoModelForTokenClassification, AutoTokenizer
+ from transformers import pipeline
+
+ # Define the model name used for language identification (setup adapted from the
+ # Impresso NER model at "https://huggingface.co/impresso-project/ner-stacked-bert-multilingual")
+ MODEL_NAME = "emanuelaboros/lang-detect"
+
+ lang_pipeline = pipeline("lang-ident", model=MODEL_NAME,
+                          trust_remote_code=True,
+                          device="cpu")
+
+ sentence = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."
+
+ result = lang_pipeline(sentence)
+ print(result)