from datasets import get_dataset_config_names, interleave_datasets, load_dataset
import pandas as pd
from huggingface_hub import hf_hub_download

from .dataset import HFDiaIterDataset

LANG_NAME_TO_CODE = {
    "dutch": "nl",
    "french": "fr",
    "german": "de",
    "italian": "it",
    "polish": "pl",
    "portuguese": "pt",
    "spanish": "es",
    # add more if other configs appear...
}


def load_cml_tts_streamed(dia_cfg, dac_model):
    """
    Stream all language subsets of the CML-TTS train split, add a
    `language` field, keep only `text`, `audio`, and `language`, and
    interleave the subsets into one streaming dataset.

    Returns:
        HFDiaIterDataset: wrapper around the interleaved streaming dataset,
        with `total_examples` set to the expected interleaved length.
    """
    # 1) Discover all language subsets.
    lang_configs = get_dataset_config_names("ylacombe/cml-tts")

    # 2) Build one streaming subset per language, keeping only the desired columns.
    streams = []
    row_counts = []
    for lang in lang_configs:
        iso_code = LANG_NAME_TO_CODE.get(lang, lang)
        ds_stream = load_dataset(
            "ylacombe/cml-tts",
            name=lang,
            split="train",
            streaming=True,
        )
        row_counts.append(ds_stream.info.splits["train"].num_examples)

        # Keep only `text` and `audio`, and tag each example with its ISO code.
        def _add_lang(ex, iso=iso_code):
            return {"text": ex["text"], "audio": ex["audio"], "language": iso}

        ds_stream = ds_stream.map(
            _add_lang,
            remove_columns=[
                c for c in ds_stream.column_names if c not in ("text", "audio")
            ],
        )
        streams.append(ds_stream)

    # 3) Interleave all streams into one unified stream. With `all_exhausted`,
    #    shorter streams are cycled until the longest runs out, so the total
    #    number of examples is max_length * num_streams.
    interleaved = interleave_datasets(streams, stopping_strategy="all_exhausted")

    ds = HFDiaIterDataset(interleaved, dia_cfg, dac_model)
    ds.total_examples = max(row_counts) * len(streams)
    return ds


def count_tsv_rows(
    repo_id: str, subset: str, split: str = "train", revision: str = "main"
) -> int:
    """Download the transcript TSV for a given subset/split and return its row count."""
    file_path = f"transcript/{subset}/{split}.tsv"
    try:
        local_file = hf_hub_download(
            repo_id=repo_id,
            filename=file_path,
            repo_type="dataset",
            revision=revision,
        )
    except Exception as e:
        # Without the TSV we cannot size the stream, so fail loudly instead of
        # falling through to an undefined `local_file`.
        raise RuntimeError(f"error fetching TSV metadata for {file_path}") from e
    df = pd.read_csv(local_file, sep="\t", low_memory=False)
    return len(df)


def load_common_voice17_streamed(dia_cfg, dac_model, revision="main"):
    """
    Stream the train split of Common Voice 17 for the given language codes,
    rename `sentence` -> `text`, keep only `text`, `audio`, and `language`,
    then interleave into a single streaming dataset.

    Languages loaded: en, de, fr, es, it, nl, pl, pt, tr, hu
    """
    repo_id = "mozilla-foundation/common_voice_17_0"
    langs = ["en", "de", "fr", "es", "it", "nl", "pl", "pt", "tr", "hu"]

    streams = []
    row_counts = []
    for lang in langs:
        # 1) Figure out how many rows the transcript TSV has.
        n_rows = count_tsv_rows(repo_id, lang, split="train", revision=revision)
        row_counts.append(n_rows)

        # 2) Load the subset in streaming mode.
        ds_stream = load_dataset(
            repo_id,
            name=lang,
            split="train",
            streaming=True,
            revision=revision,
        )

        # 3) Map to the desired schema. `remove_columns` drops columns from the
        #    output, not the function's input, so `sentence` can be read here
        #    and still be removed, leaving exactly text/audio/language.
        def _prep(ex, iso=lang):
            return {"text": ex["sentence"], "audio": ex["audio"], "language": iso}

        ds_stream = ds_stream.map(
            _prep,
            remove_columns=[c for c in ds_stream.column_names if c != "audio"],
        )
        streams.append(ds_stream)

    # 4) Interleave: all_exhausted ⇒ max_length * num_streams examples.
    interleaved = interleave_datasets(streams, stopping_strategy="all_exhausted")

    # 5) Wrap and attach the expected number of examples.
    ds = HFDiaIterDataset(interleaved, dia_cfg, dac_model)
    ds.total_examples = max(row_counts) * len(langs)
    return ds
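

# --- Usage sketch (illustrative, not part of the loader API) ---
# A minimal way to smoke-test the streaming/interleaving pattern above without
# the HFDiaIterDataset wrapper, which needs a Dia config and a DAC model built
# elsewhere in this repo. Assumes you have accepted the Common Voice 17 terms
# on the Hub and are authenticated (e.g. via `huggingface-cli login`); the two
# language codes below are arbitrary choices for a quick check.
if __name__ == "__main__":
    demo_streams = []
    for demo_lang in ("nl", "pl"):
        s = load_dataset(
            "mozilla-foundation/common_voice_17_0",
            name=demo_lang,
            split="train",
            streaming=True,
        )
        s = s.map(
            lambda ex, iso=demo_lang: {
                "text": ex["sentence"], "audio": ex["audio"], "language": iso
            },
            remove_columns=[c for c in s.column_names if c != "audio"],
        )
        demo_streams.append(s)

    demo = interleave_datasets(demo_streams, stopping_strategy="all_exhausted")
    # With two streams, examples should alternate nl/pl until both are exhausted.
    for i, ex in enumerate(demo):
        print(ex["language"], ex["text"][:40])
        if i >= 3:
            break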