import streamlit as st
import re
import string
import torch
from transformers import pipeline
from datasets import load_dataset

def img2text(url):
    """Caption an image with the BLIP large image-captioning model.

    Args:
        url: Image reference handed straight to the image-to-text pipeline
            (the caller in this file passes a local file path).

    Returns:
        The ``generated_text`` field of the first result the pipeline
        produces for the image.
    """
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-large",
        use_fast=True,
    )
    captions = captioner(url)
    first_caption = captions[0]
    return first_caption["generated_text"]

def text2story(text):
    """Generate a short story (at most 100 words) inspired by ``text``.

    A text-generation pipeline (Qwen/Qwen2.5-1.5B) is prompted with the
    scene description and sampled for up to 140 new tokens. The raw
    continuation is then cleaned up and trimmed by ``_finalize_story``.

    Args:
        text: Scene description to build the story prompt from.

    Returns:
        The cleaned story: first letter capitalised, no more than 100
        words, and ending in ``.``, ``!`` or ``?``.
    """
    story_generator = pipeline("text-generation", model="Qwen/Qwen2.5-1.5B", device_map="auto", return_full_text=False)
    prompt = f"Give me a story under 100 words based upon: {text}."
    generated = story_generator(prompt, max_new_tokens=140, do_sample=True)
    return _finalize_story(generated[0]['generated_text'])


def _finalize_story(raw_text, max_words=100):
    """Clean raw generated text and trim it to at most ``max_words`` words.

    Args:
        raw_text: Text exactly as returned by the generator.
        max_words: Upper bound on the number of whitespace-separated words
            kept in the result.

    Returns:
        The trimmed story. Whole sentences are kept while they fit in the
        word budget; if not even the first sentence fits, that text is cut
        at ``max_words`` words instead of being dropped. A ``.`` is appended
        when the result does not already end in ``.``, ``!`` or ``?`` (so an
        empty generation yields ``"."``).
    """
    # Drop stray punctuation the model may emit before the story starts.
    story_text = re.sub(r'^\s*[.,!?;:]+\s*', '', raw_text).strip()

    # Upper-case the first alphabetic character, wherever it sits.
    for i, char in enumerate(story_text):
        if char.isalpha():
            if char.islower():
                story_text = story_text[:i] + char.upper() + story_text[i + 1:]
            break

    # Split after sentence-ending punctuation followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', story_text)

    kept_sentences = []
    word_total = 0
    for sentence in sentences:
        sentence_words = len(sentence.split())
        if word_total + sentence_words > max_words:
            break
        kept_sentences.append(sentence)
        word_total += sentence_words

    if not kept_sentences:
        # The very first sentence is already over budget. Without this
        # fallback the whole story would be discarded and only "." returned,
        # so cut the text at the word limit instead.
        kept_sentences = [' '.join(story_text.split()[:max_words])]

    final_story = ' '.join(kept_sentences)

    # Guarantee terminal punctuation (the text may have been cut mid-sentence).
    if not final_story.endswith(('.', '!', '?')):
        final_story += '.'

    return final_story

def text2audio(story_text):
    """Synthesize speech for ``story_text`` with the SpeechT5 TTS pipeline.

    The speaker voice comes from entry 7306 of the validation split of the
    ``Matthijs/cmu-arctic-xvectors`` dataset.

    Args:
        story_text: The text to be spoken.

    Returns:
        The text-to-speech pipeline's output; the caller in this file reads
        its ``'audio'`` and ``'sampling_rate'`` entries.
    """
    synthesizer = pipeline("text-to-speech", "microsoft/speecht5_tts")
    xvector_rows = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

    # Add a leading dimension to the stored x-vector before passing it on.
    xvector = xvector_rows[7306]["xvector"]
    speaker_embedding = torch.tensor(xvector).unsqueeze(0)

    forward_params = {"speaker_embeddings": speaker_embedding}
    return synthesizer(story_text, forward_params=forward_params)

def main():
    """Run the Streamlit storytelling page.

    Flow: the user uploads an image; it is captioned (``img2text``), the
    caption is expanded into a short story (``text2story``), and the story
    is read aloud (``text2audio``). Each intermediate result is shown on
    the page as soon as it is available.
    """
    # Local imports keep this block self-contained.
    import os
    import tempfile

    st.set_page_config(page_title="Once Upon A Time - Storytelling Application", page_icon="📖")
    st.header("Create a story of yours with an image!🧙")

    uploaded_file = st.file_uploader("Upload an image for creating your story!")
    if uploaded_file is None:
        return

    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

    # Write the upload to a private temporary file rather than to
    # ``uploaded_file.name`` in the working directory: the name is supplied
    # by the client, so using it as a path could overwrite existing files,
    # and the copy was never cleaned up. Only the extension is reused.
    suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        image_path = tmp_file.name

    try:
        st.text('Entering the scene...🏰')
        scenario = img2text(image_path)
    finally:
        # The file is only needed while the caption is being generated.
        os.remove(image_path)
    st.write(scenario)

    st.text('Your story is going to begin...🦄')
    story = text2story(scenario)
    st.write(story)

    st.text('Your story is going to be told...🎧')
    audio_data = text2audio(story)
    st.audio(audio_data['audio'], format="audio/wav", start_time=0, sample_rate=audio_data['sampling_rate'])

# Start the app only when this file is executed as the entry script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()