|
import streamlit as st |
|
import numpy as np |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
from transformers import AutoTokenizer, AutoModel |
|
import torch |
|
|
|
|
|
model_name = 'dumitrescustefan/bert-base-romanian-cased-v1'


# Streamlit re-runs this script top-to-bottom on every widget interaction.
# Without caching, the BERT model and the embedding corpus would be reloaded
# from disk (or the HF hub) on every rerun, making the app unusably slow.
@st.cache_resource(show_spinner="Loading BERT model...")
def _load_model_and_tokenizer(name):
    """Load and cache the HF tokenizer and model once per server process."""
    tok = AutoTokenizer.from_pretrained(name)
    mdl = AutoModel.from_pretrained(name)
    mdl.eval()  # inference only; from_pretrained already defaults to eval mode
    return tok, mdl


@st.cache_data
def _load_corpus():
    """Load and cache the precomputed sentence embeddings and their texts."""
    return np.load("sentence_embeddings.npy"), np.load("sentences.npy")


tokenizer, model = _load_model_and_tokenizer(model_name)
saved_embeddings, sentences = _load_corpus()
|
|
|
|
|
def get_sentence_embedding(sentence, model, tokenizer):
    """Embed *sentence* as the model's [CLS] vector.

    The sentence is tokenized (padded/truncated to at most 128 tokens),
    run through the model without gradient tracking, and the hidden state
    of the first ([CLS]) token is returned.

    Returns a numpy array of shape (1, hidden_dim).
    """
    encoded = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
    # The [CLS] token at position 0 is used as the whole-sequence summary.
    return hidden[:, 0, :].numpy()
|
|
|
|
|
# ---- Page layout ----
st.title("Sentence Similarity with Pre-trained BERT")
st.write("Enter a sentence in Romanian to find similar sentences.")

user_input = st.text_input("Your sentence")

if user_input:
    # Embed the query with the same model that produced the stored vectors.
    query_vec = get_sentence_embedding(user_input, model, tokenizer)

    # Stored embeddings may carry an extra middle axis (n, 1, dim);
    # flatten each row to (n, dim) so sklearn accepts the matrix.
    corpus = saved_embeddings.reshape(saved_embeddings.shape[0], -1)
    scores = cosine_similarity(query_vec, corpus)[0]

    # Indices of the highest-scoring corpus sentences, best first.
    top_n = 5
    best = np.argsort(scores)[::-1][:top_n]

    st.write("Top similar sentences:")
    for idx in best:
        st.write(f"Sentence: {sentences[idx]}")
        st.write(f"Similarity score: {scores[idx]:.4f}")
|
|