import os

import cv2
import gradio as gr
import moviepy.editor as mp
import numpy as np
import speech_recognition as sr
from groq import Groq
from gtts import gTTS
from PIL import Image
from transformers import pipeline

# Read the Groq API key from the environment instead of hard-coding a secret in the source
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

# Initialize the emotion-recognition pipelines
image_pipeline = pipeline("image-classification", model="trpakov/vit-face-expression", top_k=1)
audio_pipeline = pipeline("audio-classification", model="audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim")
text_pipeline = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=2)

conversation_history = []


def process_input(video_stream):
    if isinstance(video_stream, str):
        video_file_path = video_stream

    image_features_list = []
    audio_emotion = ""
    text_input = ""
    text_emotions = ""

    # Process video frames: classify the facial expression in each frame
    cap = cv2.VideoCapture(video_file_path)
    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Convert the BGR OpenCV frame to a PIL image
        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Analyze the frame
        try:
            image_analysis = image_pipeline(pil_image)
            if image_analysis:
                image_features_list.append(image_analysis[0]['label'])
        except Exception as e:
            print(f"Error processing image data: {e}")

        # Increment frame count
        frame_count += 1
    cap.release()

    # Combine the per-frame expression labels into a single string
    image_features = ', '.join(image_features_list)
    print("Image features:", image_features)

    # Process audio data and get the emotion label
    try:
        # Extract the audio track as 16 kHz mono WAV, the format the wav2vec2 emotion model expects
        video_clip = mp.VideoFileClip(video_file_path)
        audio_file_path = os.path.join("/tmp", "audio.wav")
        video_clip.audio.write_audiofile(audio_file_path, fps=16000, ffmpeg_params=["-ac", "1"])
        video_clip.close()

        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_file_path) as source:
            audio = recognizer.record(source)

        # Convert the int16 PCM samples to a float32 waveform in [-1.0, 1.0]
        audio_data = np.frombuffer(audio.frame_data, dtype=np.int16)
        audio_data = audio_data.astype(np.float32) / 32768.0

        audio_emotions = audio_pipeline(audio_data)
        if audio_emotions:
            audio_emotion = audio_emotions[0]['label']
            print("Audio emotion:", audio_emotion)

        # Transcribe the speech
        text_input = recognizer.recognize_google(audio)
        print("User said:", text_input)
    except Exception as e:
        print(f"Error processing audio data: {e}")

    # Process the transcribed text and get its emotion labels
    try:
        text_analysis = text_pipeline(text_input)
        print("Text analysis:", text_analysis)

        # Some pipeline versions return a nested list; flatten it if needed
        if text_analysis and isinstance(text_analysis[0], list):
            text_analysis = [item for sublist in text_analysis for item in sublist]

        # Collect the predicted emotion labels
        text_emotions_list = [
            item['label'] for item in text_analysis
            if isinstance(item, dict) and 'label' in item
        ]

        if text_emotions_list:
            text_emotions = ', '.join(text_emotions_list)
        else:
            text_emotions = "No significant emotions detected in the text."
except Exception as e: print(f"Error processing text data: {e}") print("Text emotions:", text_emotions) conversation_history.append({ "user_input": text_input, "image_features": image_features, "audio_emotion": audio_emotion, "text_emotions": text_emotions }) prompt = "User said: " + text_input if image_features: prompt += "\nImage features: " + ', '.join(image_features) if audio_emotion: prompt += "\nAudio emotion: " + audio_emotion if text_emotions: prompt += "\nText emotions: " + text_emotions print("image_feature",image_features) print("Audio",audio_emotion) print("text emotions",text_emotions) chat_completion = client.chat.completions.create( messages=[ {"role": "system", "content": "As a mental health therapist, you're speaking to a user who is seeking guidance and support. They may be experiencing various challenges and are looking for solutions to improve their mental well-being. Your responses should be empathetic, supportive, and offer practical advice tailored to the user's specific issues. Remember to maintain a positive and non-judgmental tone throughout the interaction." }, {"role": "user", "content": prompt } ], model="llama3-70b-8192", temperature=0.5, max_tokens=1024, top_p=1, stop=None, stream=False, ) ai_response = chat_completion.choices[0].message.content conversation_history.append({"ai_response": ai_response}) print(ai_response) # Convert AI response to audio tts = gTTS(text=ai_response, lang='en') audio_file_path = "/tmp/ai_response.wav" tts.save(audio_file_path) return ai_response,audio_file_path,display_history() # Return the generated response def display_history(): history_str = "" for i, turn in enumerate(conversation_history): # if "user_input" in turn: # history_str += f"User: {turn['user_input']}\n" if "ai_response" in turn: history_str += f"{turn['ai_response']}\n\n" return history_str # Create the Gradio interface input_video = gr.Video(sources="webcam",label="Your Video", include_audio=True) output_text = gr.Textbox(label="Therapist Response") output_audio=gr.Audio(autoplay=True,visible=False) history_text = gr.Textbox(display_history(), label="Conversation History", placeholder="") iface = gr.Interface(fn=process_input, inputs=input_video, outputs=[output_text,output_audio,history_text], title="Mental Health Therapist", description="Speak to the AI through video input and get responses.",theme=gr.themes.Default(primary_hue="teal", secondary_hue="cyan"),allow_flagging="auto") iface.launch(debug=True)