import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# Load the CSV file
csv_file = "your_file.csv"  # Path to your CSV file
df = pd.read_csv(csv_file)

# Assuming the column containing the sentences is named 'text'
sentences = df['text'].tolist()

# Load the Romanian BERT model and tokenizer
model_name = 'dumitrescustefan/bert-base-romanian-cased-v1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to get a sentence embedding from the [CLS] token
def get_sentence_embedding(sentence, model, tokenizer):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token embedding
    return cls_embedding.numpy()

# Generate embeddings for all sentences
embeddings = [get_sentence_embedding(sentence, model, tokenizer) for sentence in sentences]

# Convert to a (num_sentences, hidden_size) numpy array
embeddings = np.array(embeddings).reshape(len(sentences), -1)

# Save embeddings to a file (optional)
np.save("sentence_embeddings.npy", embeddings)

# Save sentences alongside their embeddings for reference (optional)
df['embeddings'] = embeddings.tolist()
df.to_csv("embeddings_with_text.csv", index=False)
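
To verify what was written to disk, here is a minimal sketch (assuming the script above has already produced "sentence_embeddings.npy") that reloads the array, checks its shape, and compares two rows with cosine similarity. The indices used are purely illustrative.

# Sanity check: reload the saved embeddings and compare two sentences
import numpy as np

loaded = np.load("sentence_embeddings.npy")
print(loaded.shape)  # (num_sentences, hidden_size), e.g. (N, 768) for a BERT-base model

def cosine_similarity(a, b):
    # Cosine similarity between two 1-D embedding vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Similarity between the first two sentences in the CSV (illustrative indices)
print(cosine_similarity(loaded[0], loaded[1]))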