import streamlit as st
import re
import torch
from transformers import pipeline
from datasets import load_dataset
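
# Pipeline: image -> caption (BLIP) -> short story (Qwen2.5) -> speech (SpeechT5).
# Run with `streamlit run app.py` (assuming this file is saved as app.py).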

def img2text(url):
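    """Caption the image at `url` (a local path or URL) with the BLIP captioning model."""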
    image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", use_fast=True)
    text = image_to_text_model(url)[0]["generated_text"]
    return text

def text2story(text):
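    """Turn the caption `text` into a cleaned-up story of at most 100 words."""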
    story_generator = pipeline("text-generation", model="Qwen/Qwen2.5-1.5B", device_map="auto", return_full_text=False)
    prompt = f"Give me a story under 100 words based upon: {text}."
    generated = story_generator(prompt, max_new_tokens=140, do_sample=True)
    story_text = generated[0]['generated_text']

    # Remove leading punctuation if present
    story_text = re.sub(r'^\s*[.,!?;:]+\s*', '', story_text)
    story_text = story_text.lstrip()
    
    # Capitalize the first alphabetic character if it's lowercase
    for i, char in enumerate(story_text):
        if char.isalpha():
            if char.islower():
                story_text = story_text[:i] + char.upper() + story_text[i+1:]
            break
    
    # Split into sentences
    sentences = re.split(r'(?<=[.!?])\s+', story_text.strip())
    
    # Initialize variables
    current_word_count = 0
    final_sentences = []
    
    # Iterate through each sentence and accumulate until the word count is within 100
    for sentence in sentences:
        words = sentence.split()
        word_count = len(words)
        if current_word_count + word_count > 100:
            break
        final_sentences.append(sentence)
        current_word_count += word_count
    
    # Fallback: if even the first sentence exceeds the limit, hard-truncate it
    # so the function never returns an empty story
    if not final_sentences and sentences:
        final_sentences = [' '.join(sentences[0].split()[:100])]

    # Join the final sentences to form the story
    final_story = ' '.join(final_sentences)
    
    # Ensure it ends with a punctuation mark
    if not final_story.endswith(('.', '!', '?')):
        final_story += '.'
    
    return final_story

def text2audio(story_text):
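    """Synthesize speech for `story_text` with SpeechT5 and return the audio dict."""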
    audio_generator = pipeline("text-to-speech", "microsoft/speecht5_tts")
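    # SpeechT5 needs a speaker x-vector; use a stock embedding from the CMU Arctic set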
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    audio_output = audio_generator(story_text, forward_params={"speaker_embeddings": speaker_embedding})
    return audio_output

def main():
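    """Streamlit UI: upload an image, caption it, generate a story, and read it aloud."""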
    st.set_page_config(page_title="Once Upon A Time - Storytelling Application", page_icon="πŸ“–")
    st.header("Create a story of yours with an image!πŸ§™")

    uploaded_file = st.file_uploader("Upload an image for creating your story!")

    if uploaded_file is not None:
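        # Save the upload to disk so the captioning pipeline can read it by path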
        bytes_data = uploaded_file.getvalue()
        with open(uploaded_file.name, "wb") as file:
            file.write(bytes_data)

        st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

        st.text('Entering the scene...🏰')
        scenario = img2text(uploaded_file.name)
        st.write(scenario)

        st.text('Your story is going to begin...🦄')
        story = text2story(scenario)
        st.write(story)

        st.text('Your story is going to be told...🎧')
        audio_data = text2audio(story)
        st.audio(audio_data['audio'], format="audio/wav", start_time=0, sample_rate=audio_data['sampling_rate'])

if __name__ == "__main__":
    main()