import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# Load the CSV file and pull out the column of sentences to embed.
csv_file = "your_file.csv"
df = pd.read_csv(csv_file)
sentences = df['text'].tolist()

# Load the Romanian BERT tokenizer and model.
model_name = 'dumitrescustefan/bert-base-romanian-cased-v1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  # inference only, no gradient updates

def get_sentence_embedding(sentence, model, tokenizer):
    """Return the [CLS] token embedding of a single sentence as a NumPy array."""
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    # The [CLS] token sits at position 0 of the last hidden state.
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    return cls_embedding.numpy()

# Embed every sentence and stack the results into a (num_sentences, hidden_size) array.
embeddings = [get_sentence_embedding(sentence, model, tokenizer) for sentence in sentences]
embeddings = np.array(embeddings).reshape(len(sentences), -1)

# Save the embeddings on their own, and alongside the original text.
np.save("sentence_embeddings.npy", embeddings)
df['embeddings'] = embeddings.tolist()
df.to_csv("embeddings_with_text.csv", index=False)
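
# Optional sanity check (a minimal sketch, not part of the original pipeline):
# reload the saved .npy file and confirm there is one embedding row per input sentence.
loaded = np.load("sentence_embeddings.npy")
assert loaded.shape[0] == len(sentences)
print(loaded.shape)  # e.g. (num_sentences, 768) for a BERT-base model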