#================================================================
# https://huggingface.co/spaces/asigalov61/Lyrics-Morpher
#================================================================

print('*' * 70)
print('Loading Lyrics Morpher modules...')

import os

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import time
import datetime
from pytz import timezone

import re

import gradio as gr
import spaces

from transformers import AutoModelForCausalLM, AutoTokenizer

from typing import List, Tuple

print('*' * 70)
print('Done!')
print('*' * 70)

#==========================================================================================================

print('*' * 70)
print('Loading model and tokenizer...')
print('*' * 70)

model_name = "asigalov61/Lyrics_Qwen2.5-0.5B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

print('*' * 70)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

print('*' * 70)
print('Done!')
print('*' * 70)

#==========================================================================================================

def _split_into_blocks(lines: List[str]) -> Tuple[int, List[Tuple[List[str], int]]]:
    """
    Splits `lines` into:
      - leading_blanks: number of blank lines before the first non-blank
      - blocks: a list of (block_lines, blank_count_after) where
          * block_lines is a list of consecutive non-blank lines
          * blank_count_after is how many blank lines follow that block
    """
    i = 0
    n = len(lines)
    # count leading blank lines
    leading_blanks = 0
    while i < n and lines[i] == "":
        leading_blanks += 1
        i += 1

    blocks = []
    while i < n:
        # collect non-blank lines
        block = []
        while i < n and lines[i] != "":
            block.append(lines[i])
            i += 1
        # then count blank lines after this block
        blank_after = 0
        while i < n and lines[i] == "":
            blank_after += 1
            i += 1
        blocks.append((block, blank_after))

    return leading_blanks, blocks

def _compare_line(tpl_line: str, txt_line: str) -> Tuple[int,int]:
    """
    Compare two lines token-by-token.
    Returns (char_mismatches, word_mismatches).
    '@' in tpl_line → one uppercase letter [A–Z]
    '_' in tpl_line → one lowercase letter [a–z]
    all other chars must match exactly.
    """
    char_mis = 0
    word_mis = 0

    tpl_tokens = tpl_line.split(" ")
    txt_tokens = txt_line.split(" ")

    # difference in token count
    if len(tpl_tokens) != len(txt_tokens):
        word_mis += abs(len(tpl_tokens) - len(txt_tokens))
    # compare each pair
    for t_tok, x_tok in zip(tpl_tokens, txt_tokens):
        token_error = False
        L = min(len(t_tok), len(x_tok))
        # char-by-char
        for j in range(L):
            p, c = t_tok[j], x_tok[j]
            if p == "@":
                if not ("A" <= c <= "Z"):
                    char_mis += 1
                    token_error = True
            elif p == "_":
                if not ("a" <= c <= "z"):
                    char_mis += 1
                    token_error = True
            else:
                if p != c:
                    char_mis += 1
                    token_error = True
        # length difference
        if len(t_tok) != len(x_tok):
            char_mis += abs(len(t_tok) - len(x_tok))
            token_error = True
        if token_error:
            word_mis += 1

    return char_mis, word_mis

def count_mismatches(template: str, text: str) -> Tuple[int,int]:
    """
    Compares `template` vs `text` where:
      - '@' matches exactly one uppercase letter [A–Z]
      - '_' matches exactly one lowercase letter [a–z]
      - all other characters (spaces, punctuation, quotes, case) match exactly
      - each extra/missing newline counts as 1 char & 1 word mismatch,
        but does NOT shift subsequent line alignment.
    Returns (char_mismatches, word_mismatches).
    """
    tpl_lines = template.splitlines()
    txt_lines = text.splitlines()

    # Split into blank‐line‐aware blocks
    tpl_lead, tpl_blocks = _split_into_blocks(tpl_lines)
    txt_lead, txt_blocks = _split_into_blocks(txt_lines)

    char_mis = 0
    word_mis = 0

    # 1) Leading blank line diff
    diff_lead = abs(tpl_lead - txt_lead)
    char_mis += diff_lead
    word_mis += diff_lead

    # 2) Compare block by block
    max_blocks = max(len(tpl_blocks), len(txt_blocks))
    for i in range(max_blocks):
        # unpack or empty
        if i < len(tpl_blocks):
            tpl_block, tpl_blank_after = tpl_blocks[i]
        else:
            tpl_block, tpl_blank_after = [], 0
        if i < len(txt_blocks):
            txt_block, txt_blank_after = txt_blocks[i]
        else:
            txt_block, txt_blank_after = [], 0

        # a) compare lines in this block
        max_lines = max(len(tpl_block), len(txt_block))
        for ln in range(max_lines):
            if ln < len(tpl_block) and ln < len(txt_block):
                c1, w1 = _compare_line(tpl_block[ln], txt_block[ln])
                char_mis += c1
                word_mis += w1
            elif ln < len(tpl_block):
                # missing line in text
                word_mis += 1
                # count all chars + one '\n'
                char_mis += len(tpl_block[ln]) + 1
            else:
                # extra line in text
                word_mis += 1
                char_mis += len(txt_block[ln]) + 1

        # b) blank‐line diff after block
        diff_blank = abs(tpl_blank_after - txt_blank_after)
        char_mis += diff_blank
        word_mis += diff_blank

    return char_mis, word_mis

#==========================================================================================================

def get_lyrics_template(song_title, song_lyrics):

    if song_title:
        title = re.sub(r'\s+', ' ', re.sub(r'[^A-Za-z ]+', '', song_title.strip())).strip()

    else:
        title = 'Unknown Song'

    lines = [re.sub(r'\s+', ' ', re.sub(r'[^A-Za-z ]+', '', l.strip())).strip() if l else '\n' for l in song_lyrics.split('\n')]

    src = ''
    
    words = []
    
    for a in title.split():

        wor = ''

        for aa in a:
            if aa.isupper():
                wor += '@'

            else:
                wor += '_'

        words.append(wor)

    title_str = ' '.join(words)
            
    src += 'Song title: "' + title_str + '"\n\n'

    src += 'Song lyrics:\n\n'

    for ln in lines:
        if ln != '\n':

            words = ln.split()
    
            for w in words:
    
                src += ''.join(['@' if a.isupper() else '_' for a in w]) + ' '
    
            src = src.strip()

        src += '\n'

    return src

#==========================================================================================================

@spaces.GPU
def Morph_Lyrics(input_title, input_lyrics):
    
    print('*' * 70)
    print('Req start time: {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now(PDT)))
    start_time = time.time()
    
    print('=' * 70)
    print('Requested settings:')
    print('=' * 70)
    print('Input title:', input_title)
    print('Input lyrics:\n\n')
    print(input_lyrics)
    print('=' * 70)
    print('Processing lyrics...Please wait...')
    
    lyrics_template = get_lyrics_template(input_title, input_lyrics)
    
    print('Done!')
    print('=' * 70)
    print('Processing...Please wait...')


    messages = [
                {"role": "system", "content": "Please fill in the words in the following song lyrics template. Thank you."},
                {"role": "user", "content": lyrics_template}
               ]

    chat_text = tokenizer.apply_chat_template(messages,
                                              tokenize=False,
                                              add_generation_prompt=True
                                             )

    model_inputs = tokenizer([chat_text], return_tensors="pt").to(model.device)

    print('Done!')
    print('=' * 70)
    print('Generating...')

    num_batches = 256

    generated_ids = model.generate(**model_inputs,
                                   max_new_tokens=4096,
                                   do_sample=True,
                                   temperature=0.7,
                                   top_p=0.8,
                                   num_return_sequences=num_batches
                                  )

    print('Done!')
    print('=' * 70)

    print('Post-processing...')

    output_tokens = [output_ids[len(input_ids):] for input_ids, output_ids in zip([model_inputs.input_ids] * num_batches, generated_ids)]

    responses = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)

    final_responses = []
    
    for r in responses:
        final_responses.append(r.split('\nassistant\n')[-1])

    print('Done!')
    print('=' * 70)

    print('Selecting best response...')

    best_chars = 8192
    best_words = 8192
    
    best_response = ''
    
    for fr in final_responses:
        chars, words = count_mismatches(lyrics_template, fr)
    
        if chars < best_chars:
            best_chars = chars
            best_words = words
            best_response = fr

    print('Done!')
    print('=' * 70)

    print("Character mismatches:", best_chars)
    print("Word mismatches:     ", best_words)

    output_stats = ''

    output_stats += 'Character mismatches: ' + str(best_chars) + '\n'
    output_stats += 'Word mismatches: ' + str(best_words)

    output_lyrics = best_response

    #========================================================

    print('Req end time: {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now(PDT)))
    print('-' * 70)
    print('Req execution time:', (time.time() - start_time), 'sec')
    print('*' * 70)
    
    #========================================================
    
    return output_stats, output_lyrics
    
#==========================================================================================================

if __name__ == "__main__":

    PDT = timezone('US/Pacific')
    
    print('=' * 70)
    print('App start time: {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now(PDT)))
    print('=' * 70)
    
    app = gr.Blocks()
    
    with app:
        
        gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>Lyrics Morpher</h1>")
        gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>Morph any lyrics into an exact variations with fine-tuned Qwen2.5 0.5B Instruct model</h1>")

        input_title = gr.Textbox(label="Enter song title here", value="Nothing Else Matters")
        input_lyrics = gr.Textbox(label="Enter song lyrics here", value="So close no matter how far\nCould not be much more from the heart\nForever trusting who we are\nAnd nothing else matters")

        submit = gr.Button("Morph", variant="primary")

        gr.Markdown("## Morphing results")
        
        output_stats = gr.Textbox(label="Morphed lyrics stats")
        output_lyrics = gr.Textbox(label="Morphed lyrics")
        
        run_event = submit.click(Morph_Lyrics, 
                                 [input_title,
                                  input_lyrics                                 
                                 ],
                                 [output_stats,
                                  output_lyrics
                                 ])
        
    app.queue().launch()