from timeit import default_timer as timer |
|
import gc |
|
import os |
|
import random |
|
import logging |
|
import datasets |
|
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets |
|
from sentence_transformers import ( |
|
SentenceTransformer, |
|
SentenceTransformerTrainer, |
|
SentenceTransformerTrainingArguments, |
|
SentenceTransformerModelCardData, |
|
SimilarityFunction, |
|
) |
|
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss |
|
from sentence_transformers.training_args import BatchSamplers, MultiDatasetBatchSamplers |
|
from sentence_transformers.models.StaticEmbedding import StaticEmbedding |
|
from sentence_transformers.util import paraphrase_mining |
|
from sentence_transformers.evaluation import NanoBEIREvaluator |
|
|
|
from transformers import AutoTokenizer |
|
|
|
|
|
|
import torch |
|
|
|
|
|
version = '1' |
|
sts_basename = 'sts-mrl-en-de-base' |
|
|
|
|
|
|
|
|
|
|
|
tokenizer_model = 'dbmdz/bert-base-german-uncased' |
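# The German uncased BERT tokenizer only provides tokenization and vocabulary for the
# StaticEmbedding module built in main(); the 2048-dim token embeddings are trained from scratch.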
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
logging.basicConfig( |
|
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO |
|
) |
|
random.seed(12) |
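# Seed 12 matches the seed passed to every train_test_split below, keeping the splits reproducible.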
|
|
|
def load_train_eval_datasets(): |
|
""" |
|
Either load the train and eval datasets from disk or load them from the datasets library & save them to disk. |
|
|
|
Upon saving to disk, we quit() to ensure that the datasets are not loaded into memory before training. |
|
|
|
The order of sets here is not the same as later on in the full training/eval-sets!!! |
|
""" |
|
try: |
|
train_dataset = DatasetDict.load_from_disk("base_datasets/train_dataset") |
|
eval_dataset = DatasetDict.load_from_disk("base_datasets/eval_dataset") |
|
return train_dataset, eval_dataset |
|
except FileNotFoundError: |
|
print("No prepared dataset found. Building ...") |
|
|
|
|
|
|
|
print("Loading mMARCO-distilled-de-hn dataset...") |
|
mmarco_de_3hn_ds = load_dataset('parquet', data_files={'mmarco-de-distilled_3hn/3_hard_negatives/*.parquet'}, split="train") |
|
mmarco_de_3hn_ds = mmarco_de_3hn_ds.train_test_split(test_size=0.02, seed=12) |
|
mmarco_de_3hn_train_dataset: Dataset = mmarco_de_3hn_ds["train"] |
|
mmarco_de_3hn_eval_dataset: Dataset = mmarco_de_3hn_ds["test"] |
|
|
|
mmarco_de_2hn_ds = load_dataset('parquet', data_files={'mmarco-de-distilled_3hn/2_hard_negatives/*.parquet'}, split="train") |
|
mmarco_de_2hn_ds = mmarco_de_2hn_ds.train_test_split(test_size=0.02, seed=12) |
|
mmarco_de_2hn_train_dataset: Dataset = mmarco_de_2hn_ds["train"] |
|
mmarco_de_2hn_eval_dataset: Dataset = mmarco_de_2hn_ds["test"] |
|
|
|
mmarco_de_1hn_ds = load_dataset('parquet', data_files={'mmarco-de-distilled_3hn/1_hard_negatives/*.parquet'}, split="train") |
|
mmarco_de_1hn_ds = mmarco_de_1hn_ds.train_test_split(test_size=0.02, seed=12) |
|
mmarco_de_1hn_train_dataset: Dataset = mmarco_de_1hn_ds["train"] |
|
mmarco_de_1hn_eval_dataset: Dataset = mmarco_de_1hn_ds["test"] |
|
|
|
mmarco_de_0hn_ds = load_dataset('parquet', data_files={'mmarco-de-distilled_3hn/0_hard_negatives/*.parquet'}, split="train") |
|
mmarco_de_0hn_ds = mmarco_de_0hn_ds.train_test_split(test_size=0.02, seed=12) |
|
mmarco_de_0hn_train_dataset: Dataset = mmarco_de_0hn_ds["train"] |
|
mmarco_de_0hn_eval_dataset: Dataset = mmarco_de_0hn_ds["test"] |
|
print("Loaded mMARCO-distilled-de-hn dataset.") |
|
|
|
print("Loading local prepared wikipedia-22-12-de datasets...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
name_local = 'wikipedia-22-12-de-scored' |
|
wp_2212_de_ds = DatasetDict.load_from_disk(f'{name_local}/{name_local}.hf') |
|
wp_2212_de_train_dataset: Dataset = wp_2212_de_ds["train"].select_columns(['question', 'context']) |
|
wp_2212_de_eval_dataset: Dataset = wp_2212_de_ds["test"].select_columns(['question', 'context']) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print("Loaded prepared full wikipedia-22-12-de dataset...") |
|
|
|
print("Loading swim-ir-monolingual-de-scored dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
swim_ir_de_ds = load_dataset('parquet', data_files={'swim-ir-monolingual-de_3hn/0_hard_negatives/*.parquet'}, split="train") |
|
swim_ir_de_ds = swim_ir_de_ds.train_test_split(test_size=0.02, seed=12) |
|
swim_ir_de_train_dataset: Dataset = swim_ir_de_ds["train"] |
|
swim_ir_de_eval_dataset: Dataset = swim_ir_de_ds["test"] |
|
swim_ir_de_3hn_ds = load_dataset('parquet', data_files={'swim-ir-monolingual-de_3hn/3_hard_negatives/*.parquet'}, split="train") |
|
swim_ir_de_3hn_ds = swim_ir_de_3hn_ds.train_test_split(test_size=0.02, seed=12) |
|
swim_ir_de_3hn_train_dataset: Dataset = swim_ir_de_3hn_ds["train"] |
|
swim_ir_de_3hn_eval_dataset: Dataset = swim_ir_de_3hn_ds["test"] |
|
|
|
swim_ir_de_title_ds = load_dataset('parquet', data_files={'swim-ir-monolingual-titles-de_3hn/0_hard_negatives/*.parquet'}, split="train") |
|
swim_ir_de_title_3hn_ds = load_dataset('parquet', data_files={'swim-ir-monolingual-titles-de_3hn/3_hard_negatives/*.parquet'}, split="train") |
|
swim_ir_de_title_ds = swim_ir_de_title_ds.train_test_split(test_size=0.02, seed=12) |
|
swim_ir_de_title_3hn_ds = swim_ir_de_title_3hn_ds.train_test_split(test_size=0.02, seed=12) |
|
swim_ir_de_title_train_dataset: Dataset = swim_ir_de_title_ds['train'] |
|
swim_ir_de_title_eval_dataset: Dataset = swim_ir_de_title_ds["test"] |
|
swim_ir_de_title_3hn_train_dataset: Dataset = swim_ir_de_title_3hn_ds['train'] |
|
swim_ir_de_title_3hn_eval_dataset: Dataset = swim_ir_de_title_3hn_ds['test'] |
|
print("Loaded swim-ir-monolingual-de-scored dataset.") |
|
|
|
print("Loading avemio_triples dataset...") |
|
|
|
|
|
|
|
avemio_triples_dataset = load_dataset("avemio/German-RAG-EMBEDDING-TRIPLES-HESSIAN-AI", split="train") |
|
avemio_triples_dataset_dict = avemio_triples_dataset.train_test_split(test_size=10000, seed=12) |
|
avemio_triples_train_dataset: Dataset = avemio_triples_dataset_dict["train"] |
|
avemio_triples_eval_dataset: Dataset = avemio_triples_dataset_dict["test"] |
|
print("Loaded avemio_triples dataset.") |
|
|
|
print("Loading avemio_pairs-hn dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
avemio_pairs_3hn_ds = load_dataset('parquet', data_files={'German-RAG-EMBEDDING-PAIRS-HESSIAN-AI-3hn-350_3hn/3_hard_negatives/*.parquet', 'German-RAG-EMBEDDING-PAIRS-HESSIAN-AI-3hn-600_3hn/3_hard_negatives/*.parquet', 'German-RAG-EMBEDDING-PAIRS-HESSIAN-AI-3hn-600plus_3hn/3_hard_negatives/*.parquet',}, split="train") |
|
avemio_pairs_3hn_ds = avemio_pairs_3hn_ds.train_test_split(test_size=10000, seed=12) |
|
avemio_pairs_3hn_train_ds: Dataset = avemio_pairs_3hn_ds["train"] |
|
avemio_pairs_3hn_eval_ds: Dataset = avemio_pairs_3hn_ds["test"] |
|
del avemio_pairs_3hn_ds |
|
|
|
avemio_pairs_0hn_ds = load_dataset('parquet', data_files={'German-RAG-EMBEDDING-PAIRS-HESSIAN-AI-3hn-350_3hn/0_hard_negatives/*.parquet', 'German-RAG-EMBEDDING-PAIRS-HESSIAN-AI-3hn-600_3hn/0_hard_negatives/*.parquet', 'German-RAG-EMBEDDING-PAIRS-HESSIAN-AI-3hn-600plus_3hn/0_hard_negatives/*.parquet',}, split="train") |
|
avemio_pairs_0hn_ds = avemio_pairs_0hn_ds.train_test_split(test_size=10000, seed=12) |
|
avemio_pairs_0hn_train_ds: Dataset = avemio_pairs_0hn_ds["train"] |
|
avemio_pairs_0hn_eval_ds: Dataset = avemio_pairs_0hn_ds["test"] |
|
del avemio_pairs_0hn_ds |
|
print("Loaded avemio_pairs-hn dataset.") |
|
|
|
print("Loading nq_german-hn dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
nq_german_en_de_a_3hn_ds = load_dataset('parquet', data_files={'natural-questions-german-en_de-a-sts_3hn/3_hard_negatives/*.parquet'}, split="train") |
|
nq_german_en_de_a_3hn_ds = nq_german_en_de_a_3hn_ds.train_test_split(test_size=0.02, seed=12) |
|
nq_german_en_de_a_3hn_train_ds: Dataset = nq_german_en_de_a_3hn_ds['train'] |
|
nq_german_en_de_a_3hn_eval_ds: Dataset = nq_german_en_de_a_3hn_ds['test'] |
|
|
|
nq_german_en_de_3hn_ds = load_dataset('parquet', data_files={'natural-questions-german-en_de-sts_3hn/3_hard_negatives/*.parquet'}, split="train") |
|
nq_german_en_de_3hn_ds = nq_german_en_de_3hn_ds.train_test_split(test_size=0.02, seed=12) |
|
nq_german_en_de_3hn_train_ds: Dataset = nq_german_en_de_3hn_ds['train'] |
|
nq_german_en_de_3hn_eval_ds: Dataset = nq_german_en_de_3hn_ds['test'] |
|
|
|
nq_german_3hn_ds = load_dataset('parquet', data_files={'natural-questions-german-sts_3hn/3_hard_negatives/*.parquet'}, split="train") |
|
nq_german_3hn_ds = nq_german_3hn_ds.train_test_split(test_size=0.02, seed=12) |
|
nq_german_3hn_train_ds: Dataset = nq_german_3hn_ds['train'] |
|
nq_german_3hn_eval_ds: Dataset = nq_german_3hn_ds['test'] |
|
|
|
nq_german_1hn_ds = load_dataset('parquet', data_files={'natural-questions-german-sts_3hn/1_hard_negatives/*.parquet'}, split="train") |
|
nq_german_1hn_ds = nq_german_1hn_ds.train_test_split(test_size=0.02, seed=12) |
|
nq_german_1hn_train_ds: Dataset = nq_german_1hn_ds['train'] |
|
nq_german_1hn_eval_ds: Dataset = nq_german_1hn_ds['test'] |
|
print("Loaded nq_german-hn dataset.") |
|
|
|
print("Loading german-oasst1-qa-format-scored dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
name_local = 'german-oasst1-qa-format-hn' |
|
german_oasst1_hn_train_dataset: Dataset = load_dataset('parquet', data_files={f'{name_local}/3_hard_negatives/train-*.parquet'}, split="train") |
|
german_oasst1_hn_eval_dataset: Dataset = load_dataset('parquet', data_files={f'{name_local}/3_hard_negatives/test-*.parquet'}, split="train") |
|
print("Loaded german-oasst1-qa-format-scored dataset.") |
|
|
|
print("Loading germanrag-scored dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def list_to_string(_): |
|
_['contexts'] = ' '.join(_['contexts']) |
|
return _ |
|
germanrag_short = load_dataset("MarcGrumpyOlejak/germanrag-scored", split='train').filter(lambda _: _['score_sts'] >= 0.16 and _['score_sts'] < 0.98 and _['positive_ctx_idx'] != -1) |
|
germanrag_context = germanrag_short.select_columns(['answer', 'contexts']) |
|
germanrag_context = germanrag_context.map(list_to_string) |
|
germanrag_context = germanrag_context.rename_columns({'answer': 'sentence1', 'contexts': 'sentence2'}) |
|
germanrag_short = germanrag_short.select_columns(['question', 'answer']) |
|
germanrag_short = germanrag_short.rename_columns({'question': 'sentence1', 'answer': 'sentence2'}) |
|
germanrag_short = concatenate_datasets([germanrag_short, germanrag_context]) |
|
germanrag_short = germanrag_short.train_test_split(test_size=0.02, seed=12) |
|
germanrag_short_train_dataset: Dataset = germanrag_short["train"] |
|
germanrag_short_eval_dataset: Dataset = germanrag_short["test"] |
|
print("Loaded germanrag dataset.") |
|
|
|
print("Loading slimorca_dedup_german_experimental-scored dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
slimorca_dedup_3hn_ds = load_dataset('parquet', data_files={'slimorca_dedup_german_experimental-sts-negatives_3hn/3_hard_negatives/*.parquet'}, split="train") |
|
slimorca_dedup_3hn_ds = slimorca_dedup_3hn_ds.train_test_split(test_size=0.02, seed=12) |
|
slimorca_dedup_3hn_train_ds: Dataset = slimorca_dedup_3hn_ds['train'] |
|
slimorca_dedup_3hn_eval_ds: Dataset = slimorca_dedup_3hn_ds['test'] |
|
|
|
slimorca_dedup_2hn_ds = load_dataset('parquet', data_files={'slimorca_dedup_german_experimental-sts-negatives_3hn/2_hard_negatives/*.parquet'}, split="train") |
|
slimorca_dedup_2hn_ds = slimorca_dedup_2hn_ds.train_test_split(test_size=0.02, seed=12) |
|
slimorca_dedup_2hn_train_ds: Dataset = slimorca_dedup_2hn_ds['train'] |
|
slimorca_dedup_2hn_eval_ds: Dataset = slimorca_dedup_2hn_ds['test'] |
|
|
|
slimorca_dedup_1hn_ds = load_dataset('parquet', data_files={'slimorca_dedup_german_experimental-sts-negatives_3hn/1_hard_negatives/*.parquet'}, split="train") |
|
slimorca_dedup_1hn_ds = slimorca_dedup_1hn_ds.train_test_split(test_size=0.02, seed=12) |
|
slimorca_dedup_1hn_train_ds: Dataset = slimorca_dedup_1hn_ds['train'] |
|
slimorca_dedup_1hn_eval_ds: Dataset = slimorca_dedup_1hn_ds['test'] |
|
|
|
slimorca_dedup_0hn_ds = load_dataset('parquet', data_files={'slimorca_dedup_german_experimental-sts-negatives_3hn/0_hard_negatives/*.parquet'}, split="train") |
|
slimorca_dedup_0hn_ds = slimorca_dedup_0hn_ds.train_test_split(test_size=0.02, seed=12) |
|
slimorca_dedup_0hn_train_ds: Dataset = slimorca_dedup_0hn_ds['train'] |
|
slimorca_dedup_0hn_eval_ds: Dataset = slimorca_dedup_0hn_ds['test'] |
|
print("Loaded slimorca_dedup_german_experimental-scored dataset.") |
|
|
|
print("Loading gpt-4-self-instruct-german-scored dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
name_local = 'gpt-4-self-instruct-german-hn' |
|
german_gpt4 = load_dataset('parquet', data_files={f'{name_local}/3_hard_negatives/train-*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
german_gpt4_3hn_train_dataset: Dataset = german_gpt4["train"] |
|
german_gpt4_3hn_eval_dataset: Dataset = german_gpt4["test"] |
|
print("Loaded GPT-4-Self-Instruct-German dataset.") |
|
|
|
print("Loading ultradistil-intel-orca-dpo-de-scored dataset...") |
|
|
|
|
|
|
|
|
|
|
|
german_orca_dpo_ds = load_dataset("MarcGrumpyOlejak/ultradistil-intel-orca-dpo-de-scored").filter(lambda _: _['score_sts'] >= 0.16 and _['score_sts'] < 0.98) |
|
german_orca_dpo_ds = german_orca_dpo_ds.select_columns(['input', 'chosen', 'rejected']) |
|
german_orca_dpo_ds = german_orca_dpo_ds['train'].train_test_split(test_size=0.02, seed=12) |
|
german_orca_dpo_train_dataset: Dataset = german_orca_dpo_ds["train"] |
|
german_orca_dpo_eval_dataset: Dataset = german_orca_dpo_ds["test"] |
|
print("Loaded ultradistil-intel-orca-dpo-de-scored dataset.") |
|
|
|
|
|
print("Loading alpaca-gpt4_de-scored dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
alpaca_gpt4_de_3hn_ds = load_dataset('parquet', data_files={'alpaca-gpt4_de_3hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
alpaca_gpt4_de_3hn_train_dataset: Dataset = alpaca_gpt4_de_3hn_ds['train'] |
|
alpaca_gpt4_de_3hn_eval_dataset: Dataset = alpaca_gpt4_de_3hn_ds['test'] |
|
alpaca_gpt4_de_0hn_ds = load_dataset('parquet', data_files={'alpaca-gpt4_de_3hn/0_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
alpaca_gpt4_de_0hn_train_dataset: Dataset = alpaca_gpt4_de_0hn_ds['train'] |
|
alpaca_gpt4_de_0hn_eval_dataset: Dataset = alpaca_gpt4_de_0hn_ds['test'] |
|
print("Loaded alpaca-gpt4_de dataset.") |
|
|
|
print("Loading DOLLY-15k (en-de) dataset...") |
|
dolly_context_de_3hn_ds = load_dataset('parquet', data_files={'databricks-dolly-15k-curated-de/context-de-hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
dolly_context_de_3hn_train_ds: Dataset = dolly_context_de_3hn_ds['train'] |
|
dolly_context_de_3hn_eval_ds: Dataset = dolly_context_de_3hn_ds['test'] |
|
dolly_context_de_0hn_ds = load_dataset('parquet', data_files={'databricks-dolly-15k-curated-de/context-de-hn/0_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
dolly_context_de_0hn_train_ds: Dataset = dolly_context_de_0hn_ds['train'] |
|
dolly_context_de_0hn_eval_ds: Dataset = dolly_context_de_0hn_ds['test'] |
|
dolly_context_ende_3hn_ds = load_dataset('parquet', data_files={'databricks-dolly-15k-curated-de/context-en_de-hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
dolly_context_ende_3hn_train_ds: Dataset = dolly_context_ende_3hn_ds['train'] |
|
dolly_context_ende_3hn_eval_ds: Dataset = dolly_context_ende_3hn_ds['test'] |
|
|
|
|
|
dolly_instructions_de_3hn_ds = load_dataset('parquet', data_files={'databricks-dolly-15k-curated-de/instructions-de-hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
dolly_instructions_de_3hn_train_ds: Dataset = dolly_instructions_de_3hn_ds['train'] |
|
dolly_instructions_de_3hn_eval_ds: Dataset = dolly_instructions_de_3hn_ds['test'] |
|
dolly_instructions_de_0hn_ds = load_dataset('parquet', data_files={'databricks-dolly-15k-curated-de/instructions-de-hn/0_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
dolly_instructions_de_0hn_train_ds: Dataset = dolly_instructions_de_0hn_ds['train'] |
|
dolly_instructions_de_0hn_eval_ds: Dataset = dolly_instructions_de_0hn_ds['test'] |
|
dolly_instructions_ende_3hn_ds = load_dataset('parquet', data_files={'databricks-dolly-15k-curated-de/instructions-en_de-hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
dolly_instructions_ende_3hn_train_ds: Dataset = dolly_instructions_ende_3hn_ds['train'] |
|
dolly_instructions_ende_3hn_eval_ds: Dataset = dolly_instructions_ende_3hn_ds['test'] |
|
dolly_instructions_ende_0hn_ds = load_dataset('parquet', data_files={'databricks-dolly-15k-curated-de/instructions-en_de-hn/0_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
dolly_instructions_ende_0hn_train_ds: Dataset = dolly_instructions_ende_0hn_ds['train'] |
|
dolly_instructions_ende_0hn_eval_ds: Dataset = dolly_instructions_ende_0hn_ds['test'] |
|
dolly_responses_de_3hn_ds = load_dataset('parquet', data_files={'databricks-dolly-15k-curated-de/response-de-hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
dolly_responses_de_3hn_train_ds: Dataset = dolly_responses_de_3hn_ds['train'] |
|
dolly_responses_de_3hn_eval_ds: Dataset = dolly_responses_de_3hn_ds['test'] |
|
dolly_responses_de_0hn_ds = load_dataset('parquet', data_files={'databricks-dolly-15k-curated-de/response-de-hn/0_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
dolly_responses_de_0hn_train_ds: Dataset = dolly_responses_de_0hn_ds['train'] |
|
dolly_responses_de_0hn_eval_ds: Dataset = dolly_responses_de_0hn_ds['test'] |
|
dolly_responses_ende_3hn_ds = load_dataset('parquet', data_files={'databricks-dolly-15k-curated-de/response-en_de-hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
dolly_responses_ende_3hn_train_ds: Dataset = dolly_responses_ende_3hn_ds['train'] |
|
dolly_responses_ende_3hn_eval_ds: Dataset = dolly_responses_ende_3hn_ds['test'] |
|
dolly_responses_ende_0hn_ds = load_dataset('parquet', data_files={'databricks-dolly-15k-curated-de/response-en_de-hn/0_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
dolly_responses_ende_0hn_train_ds: Dataset = dolly_responses_ende_0hn_ds['train'] |
|
dolly_responses_ende_0hn_eval_ds: Dataset = dolly_responses_ende_0hn_ds['test'] |
|
print("Loaded DOLLY-15k (en-de) dataset.") |
|
|
|
print("Loading 'saf-legal_domain_german' dataset...") |
|
|
|
|
|
|
|
|
|
saf_legal_de_train = load_dataset("Short-Answer-Feedback/saf_legal_domain_german", split="train").filter(lambda _: _['score'] >= 0.75) |
|
saf_legal_de_qa_train = saf_legal_de_train.select_columns(['question', 'provided_answer']).rename_columns({'question': 'sentence1', 'provided_answer': 'sentence2'}) |
|
saf_legal_de_a_train = saf_legal_de_train.select_columns(['provided_answer', 'reference_answer']).rename_columns({'provided_answer': 'sentence1', 'reference_answer': 'sentence2'}) |
|
saf_legal_de_train_ds: Dataset = concatenate_datasets([saf_legal_de_qa_train, saf_legal_de_a_train]) |
|
|
|
saf_legal_de_eval = load_dataset("Short-Answer-Feedback/saf_legal_domain_german", split="validation").filter(lambda _: _['score'] >= 0.75) |
|
saf_legal_de_qa_eval = saf_legal_de_eval.select_columns(['question', 'provided_answer']).rename_columns({'question': 'sentence1', 'provided_answer': 'sentence2'}) |
|
saf_legal_de_a_eval = saf_legal_de_eval.select_columns(['provided_answer', 'reference_answer']).rename_columns({'provided_answer': 'sentence1', 'reference_answer': 'sentence2'}) |
|
saf_legal_de_eval_ds: Dataset = concatenate_datasets([saf_legal_de_qa_eval, saf_legal_de_a_eval]) |
|
print("Loaded 'saf-legal_domain_german' dataset.") |
|
|
|
print("Loading GLS dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gls_3hn = load_dataset('parquet', data_files={'german_legal_sentences_dist_3hn/3_hard_negatives/*.parquet'})['train'].train_test_split(test_size=0.02, seed=12) |
|
gls_3hn_train_dataset: Dataset = gls_3hn['train'] |
|
gls_3hn_eval_dataset: Dataset = gls_3hn['test'] |
|
gls_2hn = load_dataset('parquet', data_files={'german_legal_sentences_dist_3hn/2_hard_negatives/*.parquet'})['train'].train_test_split(test_size=0.02, seed=12) |
|
gls_2hn_train_dataset: Dataset = gls_2hn['train'] |
|
gls_2hn_eval_dataset: Dataset = gls_2hn['test'] |
|
gls_1hn = load_dataset('parquet', data_files={'german_legal_sentences_dist_3hn/1_hard_negatives/*.parquet'})['train'].train_test_split(test_size=0.02, seed=12) |
|
gls_1hn_train_dataset: Dataset = gls_1hn['train'] |
|
gls_1hn_eval_dataset: Dataset = gls_1hn['test'] |
|
gls_0hn = load_dataset('parquet', data_files={'german_legal_sentences_dist_3hn/0_hard_negatives/*.parquet'})['train'].train_test_split(test_size=0.02, seed=12) |
|
gls_0hn_train_dataset: Dataset = gls_0hn['train'] |
|
gls_0hn_eval_dataset: Dataset = gls_0hn['test'] |
|
print("Loaded GLS dataset.") |
|
|
|
print("Loading europarl EN-DE dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
europarl_dataset_3hn = load_dataset('parquet', data_files={'parallel-sentences-europarl-redux_3hn/3_hard_negatives/*.parquet'})['train'].train_test_split(test_size=10000, seed=12) |
|
europarl_3hn_train_dataset: Dataset = europarl_dataset_3hn["train"] |
|
europarl_3hn_eval_dataset: Dataset = europarl_dataset_3hn["test"] |
|
|
|
europarl_dataset_0hn = load_dataset('parquet', data_files={'parallel-sentences-europarl-redux_3hn/0_hard_negatives/*.parquet'})['train'].train_test_split(test_size=0.02, seed=12) |
|
europarl_0hn_train_dataset: Dataset = europarl_dataset_0hn["train"] |
|
europarl_0hn_eval_dataset: Dataset = europarl_dataset_0hn["test"] |
|
print("Loaded europarl EN-DE dataset.") |
|
|
|
print("Loading tatoeba EN-DE dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tatoeba_dataset_3hn = load_dataset('parquet', data_files={'parallel-sentences-tatoeba-en-de-hn/3_hard_negatives/*.parquet'})['train'].train_test_split(test_size=10000, seed=12) |
|
tatoeba_3hn_train_dataset: Dataset = tatoeba_dataset_3hn["train"] |
|
tatoeba_3hn_eval_dataset: Dataset = tatoeba_dataset_3hn["test"] |
|
|
|
tatoeba_dataset_0hn = load_dataset('parquet', data_files={'parallel-sentences-tatoeba-en-de-hn/0_hard_negatives/*.parquet'})['train'].train_test_split(test_size=0.02, seed=12) |
|
tatoeba_0hn_train_dataset: Dataset = tatoeba_dataset_0hn["train"] |
|
tatoeba_0hn_eval_dataset: Dataset = tatoeba_dataset_0hn["test"] |
|
print("Loaded tatoeba EN-DE dataset.") |
|
|
|
print("Loading WikiMatrix EN-DE dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wikimatrix_3hn_ds = load_dataset('parquet', data_files={'parallel-sentences-wikimatrix-hn_3hn/3_hard_negatives/train-*.parquet'}, split='train') |
|
wikimatrix_3hn_ds = wikimatrix_3hn_ds.train_test_split(test_size=10000, seed=12) |
|
wikimatrix_3hn_train_ds: Dataset = wikimatrix_3hn_ds["train"] |
|
wikimatrix_3hn_eval_ds: Dataset = wikimatrix_3hn_ds["test"] |
|
|
|
wikimatrix_0hn_ds = load_dataset('parquet', data_files={'parallel-sentences-wikimatrix-hn_3hn/0_hard_negatives/train-*.parquet'}, split='train') |
|
wikimatrix_0hn_ds = wikimatrix_0hn_ds.train_test_split(test_size=0.02, seed=12) |
|
wikimatrix_0hn_train_ds: Dataset = wikimatrix_0hn_ds["train"] |
|
wikimatrix_0hn_eval_ds: Dataset = wikimatrix_0hn_ds["test"] |
|
|
|
print("Loaded WikiMatrix EN-DE dataset.") |
|
|
|
print("Loading Wikipedia-Abstract DE dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wikipedia_abstract_3hn_ds = load_dataset('parquet', data_files={'Wikipedia-Abstract-distilled_3hn/3_hard_negatives/train-*.parquet'}, split='train') |
|
wikipedia_abstract_3hn_ds = wikipedia_abstract_3hn_ds.train_test_split(test_size=10000, seed=12) |
|
wikipedia_abstract_3hn_train_dataset: Dataset = wikipedia_abstract_3hn_ds["train"] |
|
wikipedia_abstract_3hn_eval_dataset: Dataset = wikipedia_abstract_3hn_ds["test"] |
|
|
|
wikipedia_abstract_0hn_ds = load_dataset('parquet', data_files={'Wikipedia-Abstract-distilled_3hn/0_hard_negatives/train-*.parquet'}, split='train') |
|
wikipedia_abstract_0hn_ds = wikipedia_abstract_0hn_ds.train_test_split(test_size=0.02, seed=12) |
|
wikipedia_abstract_0hn_train_dataset: Dataset = wikipedia_abstract_0hn_ds["train"] |
|
wikipedia_abstract_0hn_eval_dataset: Dataset = wikipedia_abstract_0hn_ds["test"] |
|
print("Loaded Wikipedia-Abstract DE dataset.") |
|
|
|
print("Loading wiktionary GDG-D DE dataset...") |
|
|
|
|
|
|
|
|
|
|
|
wiktionary_gdg_de_3hn_train_ds: Dataset = load_dataset('parquet', data_files={'GermanDefinitionGeneration-Distillation_3hn/3_hard_negatives/train-*.parquet'}, split='train') |
|
wiktionary_gdg_de_3hn_eval_ds: Dataset = load_dataset('parquet', data_files={'GermanDefinitionGeneration-Distillation_3hn/3_hard_negatives/validation-*.parquet'}, split='train') |
|
|
|
|
|
wiktionary_gdg_de_short_ds = load_dataset("jfeil/GermanDefinitionGeneration-Distillation") |
|
wiktionary_gdg_de_short_ds = wiktionary_gdg_de_short_ds.select_columns(['context_sentence', 'title']) |
|
wiktionary_gdg_de_short_train_dataset: Dataset = wiktionary_gdg_de_short_ds["train"] |
|
wiktionary_gdg_de_short_eval_dataset: Dataset = wiktionary_gdg_de_short_ds["test"] |
|
print("Loaded GDG-D DE dataset.") |
|
|
|
print("Loading wmt24pp dataset...") |
|
|
|
|
|
|
|
|
|
wmt24pp_dataset = load_dataset("google/wmt24pp", "en-de_DE", split="train").filter(lambda _: _["is_bad_source"] == False) |
|
wmt24pp_dataset = wmt24pp_dataset.select_columns(['source', 'target']) |
|
wmt24pp_dataset_dict = wmt24pp_dataset.train_test_split(test_size=0.02, seed=12) |
|
wmt24pp_train_dataset: Dataset = wmt24pp_dataset_dict["train"] |
|
wmt24pp_eval_dataset: Dataset = wmt24pp_dataset_dict["test"] |
|
print("Loaded wmt24pp dataset.") |
|
|
|
print("Loading synthia_german_experimental dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
synthia_de_ds = load_dataset("jphme/synthia_german_experimental", split="train").filter(lambda _: _["score_deutsch"] == 3 and _["score_antwort"] == 3) |
|
synthia_de_ds = synthia_de_ds.select_columns(["instruction", "response"]) |
|
synthia_de_ds = synthia_de_ds.train_test_split(test_size=0.02, seed=12) |
|
synthia_de_train_dataset: Dataset = synthia_de_ds["train"] |
|
synthia_de_eval_dataset: Dataset = synthia_de_ds["test"] |
|
print("Loaded synthia_german_experimental dataset.") |
|
|
|
print("Loading ger-backtrans-paraphrase dataset...") |
|
gbp_3hn_ds = load_dataset('parquet', data_files={'ger-backtrans-paraphrase-350c-sts_3hn/3_hard_negatives/*.parquet'}, split="train") |
|
gbp_3hn_add_ds = load_dataset('parquet', data_files={'ger-backtrans-paraphrase-200c-sts_3hn/3_hard_negatives/*.parquet'}, split="train") |
|
gbp_3hn_ds = concatenate_datasets([gbp_3hn_ds, gbp_3hn_add_ds]) |
|
gbp_3hn_ds = gbp_3hn_ds.train_test_split(test_size=0.02, seed=12) |
|
gbp_3hn_train_ds: Dataset = gbp_3hn_ds['train'] |
|
gbp_3hn_eval_ds: Dataset = gbp_3hn_ds['test'] |
|
|
|
gbp_0hn_ds = load_dataset('parquet', data_files={'ger-backtrans-paraphrase-350c-sts_3hn/0_hard_negatives/*.parquet'}, split="train") |
|
gbp_0hn_add_ds = load_dataset('parquet', data_files={'ger-backtrans-paraphrase-200c-sts_3hn/0_hard_negatives/*.parquet'}, split="train") |
|
gbp_0hn_ds = concatenate_datasets([gbp_0hn_ds, gbp_0hn_add_ds]) |
|
gbp_0hn_add_ds = load_dataset('parquet', data_files={'ger-backtrans-paraphrase-150c-sts_3hn/0_hard_negatives/*.parquet'}, split="train") |
|
gbp_0hn_ds = concatenate_datasets([gbp_0hn_ds, gbp_0hn_add_ds]) |
|
gbp_0hn_ds = gbp_0hn_ds.train_test_split(test_size=0.02, seed=12) |
|
gbp_0hn_train_ds: Dataset = gbp_0hn_ds['train'] |
|
gbp_0hn_eval_ds: Dataset = gbp_0hn_ds['test'] |
|
|
|
gbp_ende_3hn_ds = load_dataset('parquet', data_files={'ger-backtrans-paraphrase-en_de-350c-sts_3hn/3_hard_negatives/*.parquet'}, split="train") |
|
gbp_ende_3hn_add_ds = load_dataset('parquet', data_files={'ger-backtrans-paraphrase-en_de-200c-sts_3hn/3_hard_negatives/*.parquet'}, split="train") |
|
gbp_ende_3hn_ds = concatenate_datasets([gbp_ende_3hn_ds, gbp_ende_3hn_add_ds]) |
|
gbp_ende_3hn_add_ds = load_dataset('parquet', data_files={'ger-backtrans-paraphrase-en_de-150c-sts_3hn/3_hard_negatives/*.parquet'}, split="train") |
|
gbp_ende_3hn_ds = concatenate_datasets([gbp_ende_3hn_ds, gbp_ende_3hn_add_ds]) |
|
gbp_ende_3hn_ds = gbp_ende_3hn_ds.train_test_split(test_size=0.02, seed=12) |
|
gbp_ende_3hn_train_ds: Dataset = gbp_ende_3hn_ds['train'] |
|
gbp_ende_3hn_eval_ds: Dataset = gbp_ende_3hn_ds['test'] |
|
|
|
gbp_ende_0hn_ds = load_dataset('parquet', data_files={'ger-backtrans-paraphrase-en_de-350c-sts_3hn/0_hard_negatives/*.parquet'}, split="train") |
|
gbp_ende_0hn_add_ds = load_dataset('parquet', data_files={'ger-backtrans-paraphrase-en_de-200c-sts_3hn/0_hard_negatives/*.parquet'}, split="train") |
|
gbp_ende_0hn_ds = concatenate_datasets([gbp_ende_0hn_ds, gbp_ende_0hn_add_ds]) |
|
gbp_ende_0hn_add_ds = load_dataset('parquet', data_files={'ger-backtrans-paraphrase-en_de-150c-sts_3hn/0_hard_negatives/*.parquet'}, split="train") |
|
gbp_ende_0hn_ds = concatenate_datasets([gbp_ende_0hn_ds, gbp_ende_0hn_add_ds]) |
|
gbp_ende_0hn_ds = gbp_ende_0hn_ds.train_test_split(test_size=0.02, seed=12) |
|
gbp_ende_0hn_train_ds: Dataset = gbp_ende_0hn_ds['train'] |
|
gbp_ende_0hn_eval_ds: Dataset = gbp_ende_0hn_ds['test'] |
|
print("Loaded ger-backtrans-paraphrase dataset.") |
|
|
|
print("Loading STSb Multi MT (de) dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
stbs_de_3hn_train_dataset = load_dataset('parquet', data_files={'stsb_multi_mt-de-hn/3_hard_negatives/train*.parquet'}, split="train") |
|
stbs_de_3hn_eval_dataset = load_dataset('parquet', data_files={'stsb_multi_mt-de-hn/3_hard_negatives/test*.parquet'}, split="train") |
|
print("Loaded STSb Multi MT (de) dataset.") |
|
|
|
print("Loading STSb Multi MT (en) dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
stbs_en_3hn_train_dataset = load_dataset('parquet', data_files={'stsb_multi_mt-en-hn/3_hard_negatives/train*.parquet'}, split="train") |
|
stbs_en_3hn_eval_dataset = load_dataset('parquet', data_files={'stsb_multi_mt-en-hn/3_hard_negatives/test*.parquet'}, split="train") |
|
print("Loaded STSb Multi MT (en) dataset.") |
|
|
|
print("Loading paws-x (de) dataset...") |
|
|
|
|
|
|
|
|
|
|
|
pawsx_de_dataset = load_dataset("google-research-datasets/paws-x", "de").filter(lambda _: _["label"] == 1) |
|
pawsx_de_dataset = pawsx_de_dataset.select_columns(['sentence1', 'sentence2']) |
|
pawsx_de_train_dataset: Dataset = pawsx_de_dataset["train"] |
|
pawsx_de_eval_dataset: Dataset = pawsx_de_dataset["validation"] |
|
|
|
print("Loaded paws-x (de) dataset.") |
|
|
|
print("Loading paws-x (en) dataset...") |
|
|
|
|
|
|
|
pawsx_en_dataset = load_dataset("google-research-datasets/paws-x", "en").filter(lambda _: _["label"] == 1) |
|
pawsx_en_dataset = pawsx_en_dataset.select_columns(['sentence1', 'sentence2']) |
|
pawsx_en_train_dataset: Dataset = pawsx_en_dataset["train"] |
|
pawsx_en_eval_dataset: Dataset = pawsx_en_dataset["validation"] |
|
print("Loaded paws-x (en) dataset.") |
|
|
|
print("Loading all NLI-26lang-2mil7 (local) datasets...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
main_name = 'multilingual-NLI-26lang-2mil7' |
|
language = 'de' |
|
entail = 'de_entailment' |
|
transl = 'en_de' |
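# The local NLI-26lang shards follow the pattern f'{main_name}-{language}_{subset}-<variant>_hn',
# where <variant> is 'de_entailment' or 'en_de' (the en_de shards presumably pair the English
# text with its German translation).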
|
subset = 'anli' |
|
|
|
de_anli_entail_3hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{entail}_hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_anli_entail_3hn_train_ds: Dataset = de_anli_entail_3hn_ds['train'] |
|
de_anli_entail_3hn_eval_ds: Dataset = de_anli_entail_3hn_ds['test'] |
|
|
|
de_anli_entail_0hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{entail}_hn/0_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_anli_entail_0hn_train_ds: Dataset = de_anli_entail_0hn_ds['train'] |
|
de_anli_entail_0hn_eval_ds: Dataset = de_anli_entail_0hn_ds['test'] |
|
|
|
de_anli_transl_3hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{transl}_hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_anli_transl_3hn_train_ds: Dataset = de_anli_transl_3hn_ds['train'] |
|
de_anli_transl_3hn_eval_ds: Dataset = de_anli_transl_3hn_ds['test'] |
|
|
|
de_anli_transl_0hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{transl}_hn/0_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_anli_transl_0hn_train_ds: Dataset = de_anli_transl_0hn_ds['train'] |
|
de_anli_transl_0hn_eval_ds: Dataset = de_anli_transl_0hn_ds['test'] |
|
|
|
subset = 'fever' |
|
|
|
de_fever_entail_3hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{entail}_hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_fever_entail_3hn_train_ds: Dataset = de_fever_entail_3hn_ds['train'] |
|
de_fever_entail_3hn_eval_ds: Dataset = de_fever_entail_3hn_ds['test'] |
|
|
|
de_fever_entail_0hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{entail}_hn/0_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_fever_entail_0hn_train_ds: Dataset = de_fever_entail_0hn_ds['train'] |
|
de_fever_entail_0hn_eval_ds: Dataset = de_fever_entail_0hn_ds['test'] |
|
|
|
de_fever_transl_3hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{transl}_hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_fever_transl_3hn_train_ds: Dataset = de_fever_transl_3hn_ds['train'] |
|
de_fever_transl_3hn_eval_ds: Dataset = de_fever_transl_3hn_ds['test'] |
|
|
|
de_fever_transl_0hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{transl}_hn/0_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_fever_transl_0hn_train_ds: Dataset = de_fever_transl_0hn_ds['train'] |
|
de_fever_transl_0hn_eval_ds: Dataset = de_fever_transl_0hn_ds['test'] |
|
|
|
subset = 'ling' |
|
|
|
de_ling_entail_3hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{entail}_hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_ling_entail_3hn_train_ds: Dataset = de_ling_entail_3hn_ds['train'] |
|
de_ling_entail_3hn_eval_ds: Dataset = de_ling_entail_3hn_ds['test'] |
|
|
|
de_ling_entail_0hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{entail}_hn/0_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_ling_entail_0hn_train_ds: Dataset = de_ling_entail_0hn_ds['train'] |
|
de_ling_entail_0hn_eval_ds: Dataset = de_ling_entail_0hn_ds['test'] |
|
|
|
de_ling_transl_3hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{transl}_hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_ling_transl_3hn_train_ds: Dataset = de_ling_transl_3hn_ds['train'] |
|
de_ling_transl_3hn_eval_ds: Dataset = de_ling_transl_3hn_ds['test'] |
|
|
|
|
|
|
|
|
|
|
|
|
|
subset = 'mnli' |
|
|
|
de_mnli_entail_3hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{entail}_hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_mnli_entail_3hn_train_ds: Dataset = de_mnli_entail_3hn_ds['train'] |
|
de_mnli_entail_3hn_eval_ds: Dataset = de_mnli_entail_3hn_ds['test'] |
|
|
|
de_mnli_entail_0hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{entail}_hn/0_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_mnli_entail_0hn_train_ds: Dataset = de_mnli_entail_0hn_ds['train'] |
|
de_mnli_entail_0hn_eval_ds: Dataset = de_mnli_entail_0hn_ds['test'] |
|
|
|
de_mnli_transl_3hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{transl}_hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_mnli_transl_3hn_train_ds: Dataset = de_mnli_transl_3hn_ds['train'] |
|
de_mnli_transl_3hn_eval_ds: Dataset = de_mnli_transl_3hn_ds['test'] |
|
|
|
de_mnli_transl_0hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{transl}_hn/0_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_mnli_transl_0hn_train_ds: Dataset = de_mnli_transl_0hn_ds['train'] |
|
de_mnli_transl_0hn_eval_ds: Dataset = de_mnli_transl_0hn_ds['test'] |
|
|
|
subset = 'wanli' |
|
|
|
de_wanli_entail_3hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{entail}_hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_wanli_entail_3hn_train_ds: Dataset = de_wanli_entail_3hn_ds['train'] |
|
de_wanli_entail_3hn_eval_ds: Dataset = de_wanli_entail_3hn_ds['test'] |
|
|
|
de_wanli_entail_0hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{entail}_hn/0_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_wanli_entail_0hn_train_ds: Dataset = de_wanli_entail_0hn_ds['train'] |
|
de_wanli_entail_0hn_eval_ds: Dataset = de_wanli_entail_0hn_ds['test'] |
|
|
|
de_wanli_transl_3hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{transl}_hn/3_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_wanli_transl_3hn_train_ds: Dataset = de_wanli_transl_3hn_ds['train'] |
|
de_wanli_transl_3hn_eval_ds: Dataset = de_wanli_transl_3hn_ds['test'] |
|
|
|
de_wanli_transl_0hn_ds = load_dataset('parquet', data_files={f'{main_name}-{language}_{subset}-{transl}_hn/0_hard_negatives/*.parquet'}, split="train").train_test_split(test_size=0.02, seed=12) |
|
de_wanli_transl_0hn_train_ds: Dataset = de_wanli_transl_0hn_ds['train'] |
|
de_wanli_transl_0hn_eval_ds: Dataset = de_wanli_transl_0hn_ds['test'] |
|
|
|
print("Loaded all NLI-26lang-2mil7 (local hn) datasets...") |
|
|
print("Loading Jina AI dataset...") |
|
|
|
|
|
|
|
|
|
|
|
jina_ai_ps_dataset = load_dataset("jinaai/parallel-sentences", split="train") |
|
jina_ai_ps_dataset_3en = jina_ai_ps_dataset.select_columns(['anchor', 'entailment', 'negative']) |
|
jina_ai_ps_dataset_en_de = jina_ai_ps_dataset.select_columns(['anchor', 'anchor_de']) |
|
jina_ai_ps_dataset_de_de = jina_ai_ps_dataset.select_columns(['anchor_de', 'entailment_de']) |
|
|
|
jina_ai_ps_dataset_3en_dict = jina_ai_ps_dataset_3en.train_test_split(test_size=0.05, seed=12) |
|
jina_ai_ps_dataset_en_de_dict = jina_ai_ps_dataset_en_de.train_test_split(test_size=0.05, seed=12) |
|
jina_ai_ps_dataset_de_de_dict = jina_ai_ps_dataset_de_de.train_test_split(test_size=0.05, seed=12) |
|
jina_ai_ps_train_3en: Dataset = jina_ai_ps_dataset_3en_dict["train"] |
|
jina_ai_ps_eval_3en: Dataset = jina_ai_ps_dataset_3en_dict["test"] |
|
jina_ai_ps_train_en_de: Dataset = jina_ai_ps_dataset_en_de_dict["train"] |
|
jina_ai_ps_eval_en_de: Dataset = jina_ai_ps_dataset_en_de_dict["test"] |
|
jina_ai_ps_train_de_de: Dataset = jina_ai_ps_dataset_de_de_dict["train"] |
|
jina_ai_ps_eval_de_de: Dataset = jina_ai_ps_dataset_de_de_dict["test"] |
|
print("Loaded Jina AI dataset.") |
|
|
|
print("Loading Polyglot-or-Not (de) dataset...") |
|
|
|
|
|
|
|
polyglot_de_dataset = load_dataset("Polyglot-or-Not/Fact-Completion", split="German").select_columns(['stem', 'true', 'false']) |
|
polyglot_de_dict = polyglot_de_dataset.train_test_split(test_size=0.05, seed=12) |
|
polyglot_de_train_dataset: Dataset = polyglot_de_dict["train"] |
|
polyglot_de_eval_dataset: Dataset = polyglot_de_dict["test"] |
|
print("Loaded Polyglot-or-Not (de) dataset.") |
|
|
|
print("Loading Polyglot-or-Not (en) dataset...") |
|
|
|
|
|
|
|
polyglot_en_dataset = load_dataset("Polyglot-or-Not/Fact-Completion", split="English").select_columns(['stem', 'true', 'false']) |
|
polyglot_en_dict = polyglot_en_dataset.train_test_split(test_size=0.05, seed=12) |
|
polyglot_en_train_dataset: Dataset = polyglot_en_dict["train"] |
|
polyglot_en_eval_dataset: Dataset = polyglot_en_dict["test"] |
|
print("Loaded Polyglot-or-Not (de) dataset.") |
|
|
|
print("Loading Tilde_MODEL_EESC (en_de) dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tilde_EESC_dataset = load_dataset("parquet", data_files={'Tilde_EESC-en-de_hn/3_hard_negatives/train-*.parquet'}, split='train') |
|
tilde_EESC_dataset = tilde_EESC_dataset.train_test_split(test_size=10000, seed=12) |
|
tilde_EESC_train_dataset: Dataset = tilde_EESC_dataset["train"] |
|
tilde_EESC_eval_dataset: Dataset = tilde_EESC_dataset["test"] |
|
del tilde_EESC_dataset |
|
|
|
print("Loaded Tilde_MODEL_EESC (en_de) dataset.") |
|
|
|
print("Loading Tilde_MODEL_RAPID (en_de) dataset...") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tilde_RAPID_dataset = load_dataset("parquet", data_files={'Tilde_RAPID_2019-en-de-hn/3_hard_negatives/train-*.parquet'}, split='train') |
|
tilde_RAPID_dataset = tilde_RAPID_dataset.train_test_split(test_size=10000, seed=12) |
|
tilde_RAPID_train_dataset: Dataset = tilde_RAPID_dataset["train"] |
|
tilde_RAPID_eval_dataset: Dataset = tilde_RAPID_dataset["test"] |
|
del tilde_RAPID_dataset |
|
print("Loaded Tilde_MODEL_RAPID (en_de) dataset.") |
|
|
|
print("Loading miracl (de) as classification dataset...") |
|
miracl_de_dataset = load_dataset('parquet', data_files={'miracl-corpus-de-hn-*/3_hard_negatives/train-*.parquet'}, split='train') |
|
miracl_de_dataset = miracl_de_dataset.train_test_split(test_size=10000, seed=12) |
|
miracl_de_train_dataset: Dataset = miracl_de_dataset["train"] |
|
miracl_de_eval_dataset: Dataset = miracl_de_dataset["test"] |
|
|
|
miracl_de_0hn_dataset = load_dataset('parquet', data_files={'miracl-corpus-de-hn_hn/0_hard_negatives/train-*.parquet'}, split='train') |
|
miracl_de_0hn_dataset = miracl_de_0hn_dataset.train_test_split(test_size=0.02, seed=12) |
|
miracl_de_0hn_train_dataset: Dataset = miracl_de_0hn_dataset['train'] |
|
miracl_de_0hn_eval_dataset: Dataset = miracl_de_0hn_dataset['test'] |
|
print("Loaded miracl (de) as classification dataset.") |
|
|
|
train_dataset = DatasetDict({ |
|
'mmarco_3hn': mmarco_de_3hn_train_dataset, |
|
'mmarco_2hn': mmarco_de_2hn_train_dataset, |
|
'mmarco_1hn': mmarco_de_1hn_train_dataset, |
|
'mmarco_0hn': mmarco_de_0hn_train_dataset, |
|
'wp-22-12-de': wp_2212_de_train_dataset, |
|
|
|
|
|
'swim_ir_de': swim_ir_de_train_dataset, |
|
'swim_ir_de_3hn': swim_ir_de_3hn_train_dataset, |
|
'swim_ir_de_title_3hn': swim_ir_de_title_3hn_train_dataset, |
|
'swim_ir_de_title': swim_ir_de_title_train_dataset, |
|
'avemio_triples': avemio_triples_train_dataset, |
|
'avemio_pairs_3hn': avemio_pairs_3hn_train_ds, |
|
'avemio_pairs_0hn': avemio_pairs_0hn_train_ds, |
|
'nq_german_en_de_a_3hn': nq_german_en_de_a_3hn_train_ds, |
|
'nq_german_en_de_3hn': nq_german_en_de_3hn_train_ds, |
|
'nq_german_3hn': nq_german_3hn_train_ds, |
|
'nq_german_1hn': nq_german_1hn_train_ds, |
|
|
|
'german_oasst1_hn': german_oasst1_hn_train_dataset, |
|
'germanrag_short': germanrag_short_train_dataset, |
|
'slimorca_dedup_3hn': slimorca_dedup_3hn_train_ds, |
|
'slimorca_dedup_2hn': slimorca_dedup_2hn_train_ds, |
|
'slimorca_dedup_1hn': slimorca_dedup_1hn_train_ds, |
|
'slimorca_dedup_0hn': slimorca_dedup_0hn_train_ds, |
|
|
|
'german_gpt4_3hn': german_gpt4_3hn_train_dataset, |
|
'german_orca_dpo': german_orca_dpo_train_dataset, |
|
'alpaca_gpt4_3hn': alpaca_gpt4_de_3hn_train_dataset, |
|
'alpaca_gpt4_0hn': alpaca_gpt4_de_0hn_train_dataset, |
|
'dolly_context_de_3hn': dolly_context_de_3hn_train_ds, |
|
|
|
'dolly_context_ende_3hn': dolly_context_ende_3hn_train_ds, |
|
'dolly_instructions_de_3hn': dolly_instructions_de_3hn_train_ds, |
|
'dolly_instructions_de_0hn': dolly_instructions_de_0hn_train_ds, |
|
'dolly_instructions_ende_3hn': dolly_instructions_ende_3hn_train_ds, |
|
|
|
'dolly_responses_de_3hn': dolly_responses_de_3hn_train_ds, |
|
'dolly_responses_de_0hn': dolly_responses_de_0hn_train_ds, |
|
'dolly_responses_ende_3hn': dolly_responses_ende_3hn_train_ds, |
|
|
|
'saf_legal_de': saf_legal_de_train_ds, |
|
'gls_3hn': gls_3hn_train_dataset, |
|
'gls_2hn': gls_2hn_train_dataset, |
|
'gls_1hn': gls_1hn_train_dataset, |
|
'gls_0hn': gls_0hn_train_dataset, |
|
'europarl_3hn': europarl_3hn_train_dataset, |
|
'europarl_0hn': europarl_0hn_train_dataset, |
|
|
|
'tatoeba_3hn': tatoeba_3hn_train_dataset, |
|
'tatoeba_0hn': tatoeba_0hn_train_dataset, |
|
'wikimatrix_3hn': wikimatrix_3hn_train_ds, |
|
|
|
'wikipedia_abstract_3hn': wikipedia_abstract_3hn_train_dataset, |
|
'wikipedia_abstract_0hn': wikipedia_abstract_0hn_train_dataset, |
|
'wiktionary_gdg_de_3hn': wiktionary_gdg_de_3hn_train_ds, |
|
'wiktionary_gdg_de_short': wiktionary_gdg_de_short_train_dataset, |
|
'wmt24pp': wmt24pp_train_dataset, |
|
'synthia_de': synthia_de_train_dataset, |
|
'gbp_3hn': gbp_3hn_train_ds, |
|
|
|
'gbp_ende_3hn': gbp_ende_3hn_train_ds, |
|
|
|
|
|
'stbs_de_3hn': stbs_de_3hn_train_dataset, |
|
|
|
'stbs_en_3hn': stbs_en_3hn_train_dataset, |
|
'pawsx_de': pawsx_de_train_dataset, |
|
'pawsx_en': pawsx_en_train_dataset, |
|
'nli_anli_entail_3hn': de_anli_entail_3hn_train_ds, |
|
'nli_fever_entail_3hn': de_fever_entail_3hn_train_ds, |
|
'nli_ling_entail_3hn': de_ling_entail_3hn_train_ds, |
|
'nli_mnli_entail_3hn': de_mnli_entail_3hn_train_ds, |
|
'nli_wanli_entail_3hn': de_wanli_entail_3hn_train_ds, |
|
|
|
|
|
|
|
|
|
|
|
'nli_anli_transl_3hn': de_anli_transl_3hn_train_ds, |
|
'nli_fever_transl_3hn': de_fever_transl_3hn_train_ds, |
|
'nli_ling_transl_3hn': de_ling_transl_3hn_train_ds, |
|
'nli_mnli_transl_3hn': de_mnli_transl_3hn_train_ds, |
|
'nli_wanli_transl_3hn': de_wanli_transl_3hn_train_ds, |
|
|
|
|
|
|
|
|
|
|
|
'jina_ai_3en': jina_ai_ps_train_3en, |
|
'jina_ai_ende': jina_ai_ps_train_en_de, |
|
'jina_ai_dede': jina_ai_ps_train_de_de, |
|
'polyglot_de': polyglot_de_train_dataset, |
|
'polyglot_en': polyglot_en_train_dataset, |
|
'tilde_EESC': tilde_EESC_train_dataset, |
|
|
|
'miracl_de_3hn': miracl_de_train_dataset, |
|
'miracl_de_0hn': miracl_de_0hn_train_dataset, |
|
}) |
|
eval_dataset = DatasetDict({ |
|
'mmarco_3hn': mmarco_de_3hn_eval_dataset, |
|
'mmarco_2hn': mmarco_de_2hn_eval_dataset, |
|
'mmarco_1hn': mmarco_de_1hn_eval_dataset, |
|
'mmarco_0hn': mmarco_de_0hn_eval_dataset, |
|
'wp-22-12-de': wp_2212_de_eval_dataset, |
|
|
|
|
|
'swim_ir_de': swim_ir_de_eval_dataset, |
|
'swim_ir_de_3hn': swim_ir_de_3hn_eval_dataset, |
|
'swim_ir_de_title_3hn': swim_ir_de_title_3hn_eval_dataset, |
|
'swim_ir_de_title': swim_ir_de_title_eval_dataset, |
|
'avemio_triples': avemio_triples_eval_dataset, |
|
'avemio_pairs_3hn': avemio_pairs_3hn_eval_ds, |
|
'avemio_pairs_0hn': avemio_pairs_0hn_eval_ds, |
|
'nq_german_en_de_a_3hn': nq_german_en_de_a_3hn_eval_ds, |
|
'nq_german_en_de_3hn': nq_german_en_de_3hn_eval_ds, |
|
'nq_german_3hn': nq_german_3hn_eval_ds, |
|
'nq_german_1hn': nq_german_1hn_eval_ds, |
|
|
|
'german_oasst1_hn': german_oasst1_hn_eval_dataset, |
|
'germanrag_short': germanrag_short_eval_dataset, |
|
'slimorca_dedup_3hn': slimorca_dedup_3hn_eval_ds, |
|
'slimorca_dedup_2hn': slimorca_dedup_2hn_eval_ds, |
|
'slimorca_dedup_1hn': slimorca_dedup_1hn_eval_ds, |
|
'slimorca_dedup_0hn': slimorca_dedup_0hn_eval_ds, |
|
|
|
'german_gpt4_3hn': german_gpt4_3hn_eval_dataset, |
|
'german_orca_dpo': german_orca_dpo_eval_dataset, |
|
'alpaca_gpt4_3hn': alpaca_gpt4_de_3hn_eval_dataset, |
|
'alpaca_gpt4_0hn': alpaca_gpt4_de_0hn_eval_dataset, |
|
'dolly_context_de_3hn': dolly_context_de_3hn_eval_ds, |
|
|
|
'dolly_context_ende_3hn': dolly_context_ende_3hn_eval_ds, |
|
'dolly_instructions_de_3hn': dolly_instructions_de_3hn_eval_ds, |
|
'dolly_instructions_de_0hn': dolly_instructions_de_0hn_eval_ds, |
|
'dolly_instructions_ende_3hn': dolly_instructions_ende_3hn_eval_ds, |
|
|
|
'dolly_responses_de_3hn': dolly_responses_de_3hn_eval_ds, |
|
'dolly_responses_de_0hn': dolly_responses_de_0hn_eval_ds, |
|
'dolly_responses_ende_3hn': dolly_responses_ende_3hn_eval_ds, |
|
|
|
'saf_legal_de': saf_legal_de_eval_ds, |
|
'gls_3hn': gls_3hn_eval_dataset, |
|
'gls_2hn': gls_2hn_eval_dataset, |
|
'gls_1hn': gls_1hn_eval_dataset, |
|
'gls_0hn': gls_0hn_eval_dataset, |
|
'europarl_3hn': europarl_3hn_eval_dataset, |
|
'europarl_0hn': europarl_0hn_eval_dataset, |
|
|
|
'tatoeba_3hn': tatoeba_3hn_eval_dataset, |
|
'tatoeba_0hn': tatoeba_0hn_eval_dataset, |
|
'wikimatrix_3hn': wikimatrix_3hn_eval_ds, |
|
|
|
'wikipedia_abstract_3hn': wikipedia_abstract_3hn_eval_dataset, |
|
'wikipedia_abstract_0hn': wikipedia_abstract_0hn_eval_dataset, |
|
'wiktionary_gdg_de_3hn': wiktionary_gdg_de_3hn_eval_ds, |
|
'wiktionary_gdg_de_short': wiktionary_gdg_de_short_eval_dataset, |
|
'wmt24pp': wmt24pp_eval_dataset, |
|
'synthia_de': synthia_de_eval_dataset, |
|
'gbp_3hn': gbp_3hn_eval_ds, |
|
|
|
'gbp_ende_3hn': gbp_ende_3hn_eval_ds, |
|
|
|
|
|
'stbs_de_3hn': stbs_de_3hn_eval_dataset, |
|
|
|
'stbs_en_3hn': stbs_en_3hn_eval_dataset, |
|
'pawsx_de': pawsx_de_eval_dataset, |
|
'pawsx_en': pawsx_en_eval_dataset, |
|
'nli_anli_entail_3hn': de_anli_entail_3hn_eval_ds, |
|
'nli_fever_entail_3hn': de_fever_entail_3hn_eval_ds, |
|
'nli_ling_entail_3hn': de_ling_entail_3hn_eval_ds, |
|
'nli_mnli_entail_3hn': de_mnli_entail_3hn_eval_ds, |
|
'nli_wanli_entail_3hn': de_wanli_entail_3hn_eval_ds, |
|
|
|
|
|
|
|
|
|
|
|
'nli_anli_transl_3hn': de_anli_transl_3hn_eval_ds, |
|
'nli_fever_transl_3hn': de_fever_transl_3hn_eval_ds, |
|
'nli_ling_transl_3hn': de_ling_transl_3hn_eval_ds, |
|
'nli_mnli_transl_3hn': de_mnli_transl_3hn_eval_ds, |
|
'nli_wanli_transl_3hn': de_wanli_transl_3hn_eval_ds, |
|
|
|
|
|
|
|
|
|
|
|
'jina_ai_3en': jina_ai_ps_eval_3en, |
|
'jina_ai_ende': jina_ai_ps_eval_en_de, |
|
'jina_ai_dede': jina_ai_ps_eval_de_de, |
|
'polyglot_de': polyglot_de_eval_dataset, |
|
'polyglot_en': polyglot_en_eval_dataset, |
|
'tilde_EESC': tilde_EESC_eval_dataset, |
|
|
|
'miracl_de_3hn': miracl_de_eval_dataset, |
|
'miracl_de_0hn': miracl_de_0hn_eval_dataset, |
|
}) |
|
|
|
train_dataset.save_to_disk("base_datasets/train_dataset") |
|
eval_dataset.save_to_disk("base_datasets/eval_dataset") |
|
|
|
end_time = timer() |
|
print(f"Time for preprocessing (minutes): {round((end_time - start_time) / 60, 3)}")
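# Stop after the initial build; the next run loads the cached DatasetDicts from disk and
# proceeds straight to training (see the docstring above).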
|
|
|
|
|
quit() |
|
|
|
def main(): |
|
|
|
static_embedding = StaticEmbedding(AutoTokenizer.from_pretrained(tokenizer_model), embedding_dim=2048)
|
model = SentenceTransformer( |
|
modules=[static_embedding], |
|
model_card_data=SentenceTransformerModelCardData( |
|
language="de, en", |
|
license="eupl-1.2", |
|
model_name=f"A static embedding model tokenized with {tokenizer_model} and mainly built on DE/EN-datasets.", |
|
), |
|
) |
|
|
|
|
|
train_dataset, eval_dataset = load_train_eval_datasets() |
|
print(train_dataset) |
|
|
|
|
|
|
|
loss = MultipleNegativesRankingLoss(model) |
|
loss = MatryoshkaLoss(model, loss, matryoshka_dims=[32, 64, 128, 256, 512, 1024, 2048]) |
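# MultipleNegativesRankingLoss uses the other in-batch texts plus any hard-negative columns as
# negatives for each anchor; MatryoshkaLoss applies it at truncated dimensions (32 ... 2048) so
# the embeddings remain usable when shortened after training.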
|
|
|
|
|
|
|
# Mixed precision: fp16 when a CUDA GPU is available, bf16 otherwise.
if torch.cuda.is_available():
|
fp16=True |
|
bf16=False |
|
else: |
|
fp16=False |
|
bf16=True |
|
|
|
|
|
|
|
run_name = f"{sts_basename}-v{version}" |
|
args = SentenceTransformerTrainingArguments( |
|
|
|
output_dir=f"models/{run_name}", |
|
|
|
num_train_epochs=1, |
|
per_device_train_batch_size=1024 * 4, |
|
per_device_eval_batch_size=1024 * 4, |
|
learning_rate=2e-1, |
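# Purely static (EmbeddingBag) models are usually trained with a far larger learning rate than
# transformer fine-tuning; 2e-1 follows that practice.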
|
lr_scheduler_type="cosine", |
|
warmup_ratio=0.1, |
|
fp16=fp16, |
|
bf16=bf16, |
|
batch_sampler=BatchSamplers.NO_DUPLICATES, |
|
multi_dataset_batch_sampler=MultiDatasetBatchSamplers.PROPORTIONAL, |
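# NO_DUPLICATES keeps duplicate texts out of a batch (they would act as false in-batch negatives);
# PROPORTIONAL draws batches from each sub-dataset in proportion to its size.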
|
|
|
eval_strategy="steps", |
|
eval_steps=500, |
|
save_strategy="steps", |
|
save_steps=1000, |
|
save_total_limit=2, |
|
logging_steps=500, |
|
logging_first_step=True, |
|
run_name=run_name, |
|
) |
|
|
|
|
|
trainer = SentenceTransformerTrainer( |
|
model=model, |
|
args=args, |
|
train_dataset=train_dataset, |
|
eval_dataset=eval_dataset, |
|
loss=loss, |
|
) |
|
trainer.train() |
|
|
|
|
|
model.save_pretrained(f"models/{run_name}/final") |
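# Quick sanity check only: NanoBEIR consists of small English retrieval sets, so these scores
# are a rough indicator rather than a German-language evaluation.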
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluator = NanoBEIREvaluator(show_progress_bar=True) |
|
results = evaluator(model) |
|
print(f"\n{evaluator.primary_metric}: {results[evaluator.primary_metric]}")
|
|
|
|
|
if __name__ == "__main__": |
|
start_time = timer() |
|
main() |
|
end_time = timer() |
|
print(f"Time for training (minutes): {round((end_time - start_time) / 60, 3)}")
|
|
|
|