# georgeek's picture
# Transfer
# 5ecde30
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
# Load CSV file
csv_file = "your_file.csv" # Path to your CSV file
df = pd.read_csv(csv_file)
# Assuming the column containing sentences is named 'text'.
# Empty cells are read by pandas as NaN (a float), which the tokenizer cannot
# process; replace them with "" and coerce every value to str. No rows are
# dropped, so `sentences` stays aligned one-to-one with the rows of `df`.
sentences = df['text'].fillna('').astype(str).tolist()
# Load Romanian BERT model and tokenizer
model_name = 'dumitrescustefan/bert-base-romanian-cased-v1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Function to get sentence embedding
def get_sentence_embedding(sentence, model, tokenizer):
    """Return the first-token (CLS) embedding of ``sentence`` as a NumPy array.

    Args:
        sentence: Text handed to ``tokenizer``. Input longer than 128 tokens
            is truncated.
        model: Callable that accepts the tokenizer output as keyword
            arguments and returns an object exposing a 3-D
            ``last_hidden_state`` tensor.
        tokenizer: Callable that turns ``sentence`` into a mapping of
            PyTorch tensors (it is called with ``return_tensors='pt'``).

    Returns:
        A 2-D ``numpy.ndarray`` holding, for each sequence in the batch, the
        last-hidden-state vector at token position 0.
    """
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)

    # If the model reports the device it lives on, move the inputs there so
    # the forward pass also works when the model is not on the CPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    cls_embedding = outputs.last_hidden_state[:, 0, :]  # CLS token embedding
    # .numpy() only works on CPU tensors that do not require grad.
    return cls_embedding.detach().cpu().numpy()
# Generate embeddings for all sentences (one forward pass per sentence)
sentence_vectors = [get_sentence_embedding(sentence, model, tokenizer) for sentence in sentences]
if sentence_vectors:
    # Convert to a 2-D numpy array with one row per sentence
    embeddings = np.array(sentence_vectors).reshape(len(sentences), -1)
else:
    # With no sentences, reshape(0, -1) cannot infer the second dimension and
    # raises ValueError; build an empty matrix of the model's hidden width
    # instead so the save steps below still run.
    embeddings = np.empty((0, model.config.hidden_size), dtype=np.float32)
# Save embeddings to a file (optional)
np.save("sentence_embeddings.npy", embeddings)
# Save each embedding next to its sentence for reference (optional);
# the vectors are stored in the CSV as Python lists.
df['embeddings'] = embeddings.tolist()
df.to_csv("embeddings_with_text.csv", index=False)