|
import streamlit as st |
|
import numpy as np |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
from transformers import AutoTokenizer, AutoModel |
|
import torch |
|
|
|
|
|
model_name = 'dumitrescustefan/bert-base-romanian-cased-v1'


# Streamlit re-runs this script top-to-bottom on every widget interaction.
# Without caching, the BERT model and the embedding corpus would be reloaded
# from disk (or the HF hub) on every rerun, making the app unusably slow.
@st.cache_resource(show_spinner="Loading BERT model...")
def _load_model_and_tokenizer(name):
    """Load and cache the HF tokenizer and model once per server process."""
    tok = AutoTokenizer.from_pretrained(name)
    mdl = AutoModel.from_pretrained(name)
    mdl.eval()  # inference only; from_pretrained already defaults to eval mode
    return tok, mdl


@st.cache_data
def _load_corpus():
    """Load and cache the precomputed sentence embeddings and their texts."""
    return np.load("sentence_embeddings.npy"), np.load("sentences.npy")


tokenizer, model = _load_model_and_tokenizer(model_name)
saved_embeddings, sentences = _load_corpus()
|
|
|
|
|
def get_sentence_embedding(sentence, model, tokenizer):
    """Embed *sentence* as the model's [CLS] vector.

    The sentence is tokenized (padded/truncated to at most 128 tokens),
    run through the model without gradient tracking, and the hidden state
    of the first ([CLS]) token is returned.

    Returns a numpy array of shape (1, hidden_dim).
    """
    encoded = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
    # The [CLS] token at position 0 is used as the whole-sequence summary.
    return hidden[:, 0, :].numpy()
|
|
|
|
|
# ---- Page layout ----
st.title("Sentence Similarity with Pre-trained BERT")
st.write("Enter a sentence in Romanian to find similar sentences.")

user_input = st.text_input("Your sentence")

if user_input:
    # Embed the query with the same model that produced the stored vectors.
    query_vec = get_sentence_embedding(user_input, model, tokenizer)

    # Stored embeddings may carry an extra middle axis (n, 1, dim);
    # flatten each row to (n, dim) so sklearn accepts the matrix.
    corpus = saved_embeddings.reshape(saved_embeddings.shape[0], -1)
    scores = cosine_similarity(query_vec, corpus)[0]

    # Indices of the highest-scoring corpus sentences, best first.
    top_n = 5
    best = np.argsort(scores)[::-1][:top_n]

    st.write("Top similar sentences:")
    for idx in best:
        st.write(f"Sentence: {sentences[idx]}")
        st.write(f"Similarity score: {scores[idx]:.4f}")
|
|