import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# Load the CSV file
csv_file = "your_file.csv"  # Path to your CSV file
df = pd.read_csv(csv_file)

# Assuming the column containing the sentences is named 'text'
sentences = df['text'].tolist()

# Load the Romanian BERT model and tokenizer
model_name = 'dumitrescustefan/bert-base-romanian-cased-v1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to get a sentence embedding from the [CLS] token
def get_sentence_embedding(sentence, model, tokenizer):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token embedding
    return cls_embedding.numpy()

# Generate embeddings for all sentences
embeddings = [get_sentence_embedding(sentence, model, tokenizer) for sentence in sentences]

# Convert to a (num_sentences, hidden_size) numpy array
embeddings = np.array(embeddings).reshape(len(sentences), -1)

# Save embeddings to a file (optional)
np.save("sentence_embeddings.npy", embeddings)

# Save sentences alongside their embeddings for reference (optional)
df['embeddings'] = embeddings.tolist()
df.to_csv("embeddings_with_text.csv", index=False)
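
To verify what was written to disk, here is a minimal sketch (assuming the script above has already produced "sentence_embeddings.npy") that reloads the array, checks its shape, and compares two rows with cosine similarity. The indices used are purely illustrative.

# Sanity check: reload the saved embeddings and compare two sentences
import numpy as np

loaded = np.load("sentence_embeddings.npy")
print(loaded.shape)  # (num_sentences, hidden_size), e.g. (N, 768) for a BERT-base model

def cosine_similarity(a, b):
    # Cosine similarity between two 1-D embedding vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Similarity between the first two sentences in the CSV (illustrative indices)
print(cosine_similarity(loaded[0], loaded[1]))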