File size: 1,731 Bytes
5ecde30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import streamlit as st
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch

# Load Romanian BERT model and tokenizer
model_name = 'dumitrescustefan/bert-base-romanian-cased-v1'


# Streamlit re-executes this script from the top on every widget interaction.
# Without caching, the model, tokenizer and .npy files would be loaded again
# each time the user submits a sentence.
# NOTE: st.cache_resource / st.cache_data are not available in old Streamlit
# releases that only provide st.cache.
@st.cache_resource
def _load_tokenizer_and_model(name):
    """Load the tokenizer and model for ``name`` and keep them across reruns.

    Args:
        name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple. The same objects are handed back on
        every rerun instead of being re-created.
    """
    return AutoTokenizer.from_pretrained(name), AutoModel.from_pretrained(name)


@st.cache_data
def _load_saved_corpus(embeddings_path, sentences_path):
    """Read the pre-saved embeddings and sentences, cached across reruns.

    The cache is keyed on the path strings only. If the files are regenerated
    on disk, clear the Streamlit cache or restart the app to pick them up.

    Args:
        embeddings_path: Path of the ``.npy`` file holding the embeddings.
        sentences_path: Path of the ``.npy`` file holding the sentences.

    Returns:
        A ``(embeddings, sentences)`` tuple of the arrays read by ``np.load``.
    """
    return np.load(embeddings_path), np.load(sentences_path)


tokenizer, model = _load_tokenizer_and_model(model_name)

# Load pre-saved embeddings and sentences
saved_embeddings, sentences = _load_saved_corpus("sentence_embeddings.npy", "sentences.npy")

# Function to get sentence embedding
def get_sentence_embedding(sentence, model, tokenizer):
    """Encode ``sentence`` and return the hidden state at the first token position.

    Args:
        sentence: Text handed to ``tokenizer``.
        model: Callable invoked with the tokenizer output as keyword
            arguments; its result must expose ``last_hidden_state``.
        tokenizer: Callable that turns ``sentence`` into PyTorch tensors.
            It is called with padding and truncation enabled and a maximum
            length of 128.

    Returns:
        A NumPy array holding ``last_hidden_state[:, 0, :]``, i.e. the vector
        at position 0 (the CLS token) for each sequence in the batch.
    """
    encoded = tokenizer(
        sentence,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128,
    )

    # Inference only: skip gradient tracking during the forward pass.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Keep only the first token (CLS) of every sequence.
    return hidden_states[:, 0, :].numpy()

# Streamlit UI: page header and short instructions
st.title("Sentence Similarity with Pre-trained BERT")
st.write("Enter a sentence in Romanian to find similar sentences.")

# Free-text query box; yields an empty string until the user types something
user_input = st.text_input("Your sentence")

# Run the search only when the box holds a non-empty string
if user_input:
    # Number of best matches to list
    top_n = 5

    # Embed the query text
    query_vec = get_sentence_embedding(user_input, model, tokenizer)

    # Flatten the stored embeddings to one row per entry, then score every
    # row against the query; row 0 of the result belongs to the single query
    corpus_matrix = saved_embeddings.reshape(saved_embeddings.shape[0], -1)
    scores = cosine_similarity(query_vec, corpus_matrix)[0]

    # Ascending argsort, reversed for descending order, cut to the top_n best
    best = np.argsort(scores)[::-1][:top_n]

    st.write("Top similar sentences:")

    # Show each match followed by its similarity score
    for idx, score in zip(best, scores[best]):
        st.write(f"Sentence: {sentences[idx]}")
        st.write(f"Similarity score: {score:.4f}")