import streamlit as st
import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel
from io import BytesIO
import base64

# Page configuration: title, favicon, wide layout, and help/about links
st.set_page_config(
    page_title="📺Transcript📜EDA🔍NLTK",
    page_icon="🌠",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://huggingface.co/awacke1',
        'Report a bug': "https://huggingface.co/spaces/awacke1/WebDataDownload",
        'About': "# Midjourney: https://discord.com/channels/@me/997514686608191558"
    }
)

# Download required NLTK resources (tokenizer models and stopword list)
nltk.download('punkt')
nltk.download('stopwords')


def remove_timestamps(text):
    """Remove 'M:SS' timestamp lines, along with the line that follows each one."""
    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)


def extract_high_information_words(text, top_n=10):
    """Return the top_n most frequent alphabetic, non-stopword tokens in the text."""
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    freq_dist = FreqDist(filtered_words)
    return [word for word, _ in freq_dist.most_common(top_n)]


def cluster_sentences(sentences, num_clusters):
    """Cluster sentences with k-means on TF-IDF vectors; order each cluster by similarity to its centroid."""
    # Keep only sentences longer than 10 characters
    sentences = [sentence for sentence in sentences if len(sentence) > 10]

    # Vectorize the sentences
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)

    # Perform k-means clustering (explicit n_init keeps behavior stable across scikit-learn versions)
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    kmeans.fit(X)

    # Centroid of each cluster
    cluster_centers = kmeans.cluster_centers_

    # Group sentences by cluster, recording each sentence's similarity to its centroid
    clustered_sentences = [[] for _ in range(num_clusters)]
    for i, label in enumerate(kmeans.labels_):
        similarity = linear_kernel(cluster_centers[label:label + 1], X[i:i + 1]).flatten()[0]
        clustered_sentences[label].append((similarity, sentences[i]))

    # Order sentences within each cluster by similarity to the centroid (descending);
    # sorting on the score alone avoids falling back to string comparison on ties
    for cluster in clustered_sentences:
        cluster.sort(key=lambda pair: pair[0], reverse=True)

    # Return the ordered clustered sentences without similarity scores for display
    return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]


def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
    """Convert text to a base64 data-URI anchor tag that triggers a file download."""
    buffer = BytesIO()
    buffer.write(text_to_download.encode())
    buffer.seek(0)
    b64 = base64.b64encode(buffer.read()).decode()
    href = f'<a href="data:file/txt;base64,{b64}" download="{filename}">{button_label}</a>'
    return href


# Main UI: upload a transcript, cluster its sentences, and offer per-cluster downloads
uploaded_file = st.file_uploader("📁 Choose a .txt file", type=['txt'])

if uploaded_file:
    file_text = uploaded_file.read().decode("utf-8")
else:
    file_text = ""

if file_text:
    text_without_timestamps = remove_timestamps(file_text)
    sentences = [sentence.strip() for sentence in text_without_timestamps.split('.') if len(sentence.strip()) > 10]

    with st.expander("📝 Sentence Clustering"):
        num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
        clustered_sentences = cluster_sentences(sentences, num_clusters)

        for i, cluster in enumerate(clustered_sentences):
            st.text_area(f"Cluster {i+1}", value="\n".join(cluster), height=100)

            # Input for a custom download filename
            default_filename = f"Cluster_{i+1}_Output.txt"
            filename = st.text_input("Enter filename for download:", value=default_filename, key=f"filename_{i}")

            # Download link for this cluster
            download_link = get_text_file_download_link("\n".join(cluster), filename, f"💾 Save Cluster {i+1}")
            st.markdown(download_link, unsafe_allow_html=True)

st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")
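# A minimal usage sketch (assumptions: the script is saved as app.py, a
# hypothetical filename, and streamlit/nltk/scikit-learn are installed):
#
#   pip install streamlit nltk scikit-learn
#   streamlit run app.py
#
# Streamlit serves the app at http://localhost:8501 by default; upload a
# transcript .txt file, then adjust the "Number of Clusters" slider to
# regroup the sentences and save any cluster via its download link.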