import string
from collections import Counter

import librosa
import matplotlib.pyplot as plt
import nltk
import numpy as np
import torch
from wordcloud import WordCloud

# Tokenizer and POS-tagger resources; the *_tab / *_eng names are required by
# newer NLTK releases, the older names keep backwards compatibility.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32


def get_pitch_list(y, sr):
    hop_length = int(sr / 30)  # hop_length determines how far apart the frames are

    # Extract pitch candidates using librosa's piptrack method
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, hop_length=hop_length)

    # For each frame, keep the pitch of the strongest-magnitude bin
    pitch_frequencies = []
    for t in range(pitches.shape[1]):
        index = magnitudes[:, t].argmax()  # index of the maximum magnitude
        pitch_frequencies.append(pitches[index, t])

    # Convert pitch_frequencies to a NumPy array
    pitch_frequencies = np.array(pitch_frequencies)
    print("pitch track shape:", pitch_frequencies.shape)
    return pitch_frequencies


def extract_audio_features(audio_path, asrmodel, asrproc, sentipipe, duration, wordcloud_path):
    # `duration` is accepted by the signature but not used below.
    y, sr = librosa.load(audio_path, sr=16000)

    # Transcription (Whisper-style processor/model pair)
    inputs = asrproc(y, sampling_rate=sr, return_tensors="pt").input_features
    inputs = inputs.to(device, dtype=torch_dtype)
    with torch.no_grad():
        generated_ids = asrmodel.generate(inputs)
    transcript = asrproc.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Sound intensity (mean RMS energy)
    rms = librosa.feature.rms(y=y)
    sound_intensity = np.mean(rms)

    # Frame-wise pitch track
    pitches = get_pitch_list(y, sr)

    # Fundamental frequency (F0); pass sr explicitly, since pyin defaults to 22050 Hz
    f0, _, _ = librosa.pyin(y, fmin=librosa.note_to_hz('C2'),
                            fmax=librosa.note_to_hz('C7'), sr=sr)
    fundamental_frequency = np.nanmean(f0)

    # Spectral energy (based on the STFT)
    S = np.abs(librosa.stft(y))
    spectral_energy = np.mean(np.sum(S ** 2, axis=0))

    # Spectral centroid
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    avg_spectral_centroid = np.mean(spectral_centroid)

    # Zero-crossing rate
    zcr = librosa.feature.zero_crossing_rate(y)
    zero_crossing_rate = np.mean(zcr)

    # Pause detection. librosa.effects.split returns the *non-silent* spans and
    # top_db must be positive, so pause time is total time minus voiced time.
    silence_threshold_db = 40  # frames more than 40 dB below peak count as silence
    voiced_intervals = librosa.effects.split(y, top_db=silence_threshold_db)
    voiced_duration = sum((end - start) / sr for start, end in voiced_intervals)
    total_duration = librosa.get_duration(y=y, sr=sr)
    pause_duration = total_duration - voiced_duration
    pause_rate = (pause_duration / total_duration) * 60  # seconds of pause per minute of audio

    # Transcript processing
    words = nltk.word_tokenize(transcript)
    words = [word.lower() for word in words if word not in string.punctuation]
    num_words = len(words)
    unique_words = len(set(words))
    word_frequencies = Counter(words)

    # Speaking-rate statistics
    duration_minutes = total_duration / 60
    avg_words_per_minute = num_words / duration_minutes
    avg_unique_words_per_minute = unique_words / duration_minutes

    # Filler word detection. Multi-word fillers can never appear as single keys
    # in the token Counter, so count those as substrings of the lowercased
    # transcript instead; single-word fillers use the Counter directly.
    filler_words = [
        'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
        'i mean', 'okay', 'right', 'actually', 'basically', 'you see',
        'sort of', 'kind of', 'yeah', 'literally', 'just', 'i guess',
        'totally', 'honestly', 'seriously', 'alright'
    ]
    transcript_lower = transcript.lower()
    filler_word_count = sum(
        transcript_lower.count(filler) if ' ' in filler else word_frequencies.get(filler, 0)
        for filler in filler_words
    )
    filler_words_per_minute = filler_word_count / duration_minutes

    # POS tagging (the NN*/JJ*/VB* prefixes cover all noun/adjective/verb subtags)
    pos_tags = nltk.pos_tag(words)
    nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
    adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
    verbs = [word for word, pos in pos_tags if pos.startswith('VB')]
    # Sentiment analysis; map the generic LABEL_* names to readable labels and
    # fall back to the raw label if the pipeline already returns readable ones
    sentiment = sentipipe(transcript)
    sentiment_mapping = {
        "LABEL_0": "Negative",
        "LABEL_1": "Neutral",
        "LABEL_2": "Positive"
    }
    sentiment[0]['label'] = sentiment_mapping.get(sentiment[0]['label'], sentiment[0]['label'])

    # Generate a word cloud and save it to the provided path
    wordcloud = WordCloud(width=800, height=400,
                          background_color='white').generate_from_frequencies(word_frequencies)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(wordcloud_path, format='png')
    plt.close()

    print("Nouns:", nouns)
    print("Adjectives:", adjectives)
    print("Verbs:", verbs)
    print("Sentiment:", sentiment)

    return {
        "transcript": transcript,
        "sentiment": sentiment,
        "sound_intensity": float(sound_intensity),
        "fundamental_frequency": float(fundamental_frequency),
        "spectral_energy": float(spectral_energy),
        "spectral_centroid": float(avg_spectral_centroid),
        "zero_crossing_rate": float(zero_crossing_rate),
        "avg_words_per_minute": float(avg_words_per_minute),
        "avg_unique_words_per_minute": float(avg_unique_words_per_minute),
        "unique_word_count": int(unique_words),
        "filler_words_per_minute": float(filler_words_per_minute),
        "noun_count": len(nouns),
        "adjective_count": len(adjectives),
        "verb_count": len(verbs),
        "pause_rate": float(pause_rate)
    }, pitches
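
# --- Example usage: a minimal sketch, not part of the original pipeline ---
# extract_audio_features expects a Whisper-style processor/model pair from
# Hugging Face transformers (it uses .input_features and .batch_decode) and a
# text-classification pipeline whose labels follow the LABEL_0/1/2 convention.
# The model names, audio path, and output path below are illustrative
# assumptions, not requirements of this script. Note that Whisper's feature
# extractor pads/truncates audio to 30-second windows, so very long recordings
# would need chunking before transcription.
if __name__ == "__main__":
    from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline

    asrproc = WhisperProcessor.from_pretrained("openai/whisper-small")
    asrmodel = WhisperForConditionalGeneration.from_pretrained(
        "openai/whisper-small", torch_dtype=torch_dtype
    ).to(device)
    sentipipe = pipeline(
        "text-classification",
        model="cardiffnlp/twitter-roberta-base-sentiment",  # returns LABEL_0/1/2
    )

    features, pitch_track = extract_audio_features(
        audio_path="sample.wav",         # hypothetical input file
        asrmodel=asrmodel,
        asrproc=asrproc,
        sentipipe=sentipipe,
        duration=None,                   # unused by the function body
        wordcloud_path="wordcloud.png",  # hypothetical output path
    )
    print(features)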