In [1]:
from collections import Counter
import random

import torch
import torch.nn.functional as F
import torch.utils.data as data
from torch.nn.utils.rnn import pad_sequence
from torchtext.datasets import Multi30k
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab
from tqdm import tqdm

import torch.nn as nn
import torch.optim as optim
import math
import copy
import numpy as np


random.seed(0)
torch.manual_seed(seed=0)
torch.backends.cudnn.deterministic = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("using device:", device)

using device: cuda


In [2]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # Ensure that the model dimension (d_model) is divisible by the number of heads
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        # Initialize dimensions
        self.d_model = d_model # Model's dimension
        self.num_heads = num_heads # Number of attention heads
        self.d_k = d_model // num_heads # Dimension of each head's key, query, and value

        # Linear layers for transforming inputs
        self.W_q = nn.Linear(d_model, d_model) # Query transformation
        self.W_k = nn.Linear(d_model, d_model) # Key transformation
        self.W_v = nn.Linear(d_model, d_model) # Value transformation
        self.W_o = nn.Linear(d_model, d_model) # Output transformation

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # Calculate attention scores
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        # Apply mask if provided (useful for preventing attention to certain parts like padding)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)

        # Softmax is applied to obtain attention probabilities
        attn_probs = torch.softmax(attn_scores, dim=-1)

        # Multiply by values to obtain the final output
        output = torch.matmul(attn_probs, V)
        return output

    def split_heads(self, x):
        # Reshape the input to have num_heads for multi-head attention
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        # Combine the multiple heads back to original shape
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        # Apply linear transformations and split heads
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        # Perform scaled dot-product attention
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

        # Combine heads and apply output transformation
        output = self.W_o(self.combine_heads(attn_output))
        return output

class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]


class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x


class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x



class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, graph_type):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.graph_type= graph_type

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()

        # tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt, grf1, grf2, grf3, grf4):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))

        # print('src_embedded.shape', src_embedded.shape)

        grf_embedded1 = self.dropout(grf1)
        grf_embedded2 = self.dropout(grf2)
        grf_embedded3 = self.dropout(grf3)
        grf_embedded4 = self.dropout(grf4)

        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        if self.graph_type == "basic":
            enc_output = src_embedded # basic transformer
        elif self.graph_type == "egcr":
            enc_output = src_embedded + grf_embedded1 # basic paper
        elif self.graph_type == "pos":
            enc_output = src_embedded + grf_embedded2 # basicpaper transformer + pos graph
        elif self.graph_type == "dp":
            enc_output = src_embedded + grf_embedded3 # basicpaper transformer + DP graph
        elif self.graph_type == "semantic":
            enc_output = src_embedded + grf_embedded4 # basicpaper transformer + semantic graph
        else:
            enc_output = src_embedded + grf_embedded1 + grf_embedded2 + grf_embedded3 # basicpaper graph + pos graph + parsing-tree graph
        # print(grf_embedded.shape)

        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

In [3]:
def preprocess_english(sentence):
    """
    preprocess english senttence

    Param sentence: input sentence
    """

    sentence = sentence.strip()
    sentence = sentence.replace(';', '')
    return sentence

def preprocess_persian(sentence):
    """
    preprocess persian senttence

    Param sentence: input sentence
    """

    sentence = sentence.strip().split('///')[0]
    sentence = sentence.replace(';', '')
    return sentence

def process_input_line(l):
    """
    preprocess input en --> fa sentendes
    each sentence is contain a persian sentcence and some of persian sentncess seperated by "///"

    Param: l: input row (line)
    """
    l = l.strip().split('\t')
    return [preprocess_english(l[0]), preprocess_persian(l[1])]

def preprocess_tep_dataset(en_path, fa_path):
    """
    making text pairs for tep dataset

    """

    en_lines = []
    with open(en_path) as f:
        en_lines = f.readlines()
    en_lines = [line.strip() for line in en_lines]

    fa_lines = []
    with open(fa_path) as f:
        fa_lines = f.readlines()
    fa_lines = [line.strip() for line in fa_lines]

    text_pairs = []
    for en, fa in zip(en_lines, fa_lines):
        text_pairs.append([en, fa])

    return text_pairs




In [4]:

# download file
# download tep datste
import os
if not os.path.exists('tep-data.zip'):
    !wget -O tep-data.zip https://object.pouta.csc.fi/OPUS-TEP/v1/moses/en-fa.txt.zip


if not os.path.exists('TEP.en-fa.en'):
    !unzip tep-data.zip


--2024-04-16 11:53:53--  https://object.pouta.csc.fi/OPUS-TEP/v1/moses/en-fa.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16353318 (16M) [application/zip]
Saving to: ‘tep-data.zip’


2024-04-16 11:53:55 (10.2 MB/s) - ‘tep-data.zip’ saved [16353318/16353318]

Archive:  tep-data.zip
  inflating: TEP.en-fa.en            
  inflating: TEP.en-fa.fa            
  inflating: README                  


In [5]:
# nltk modules
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [6]:
# data pr3eprocessing and limit
text_pairs = preprocess_tep_dataset('TEP.en-fa.en', 'TEP.en-fa.fa')
text_pairs = text_pairs[:10000] # get just 10000 first rows


lengths = []
for src, tgt in text_pairs:
    l = sent_tokenize(src)
    for item in l:
        lengths.append(len(item))

print("max sentecne length is: ", max(lengths))


all_data = [(src.lower(), tgt) for src, tgt in text_pairs if len(src) > 0 and len(src) < 800 and len(tgt) < 800]



max sentecne length is:  156


In [9]:
# import pandas as pd
# data_df = {'persian':[], 'english':[]}


# for li in all_data:
#     data_df['english'].append(li[0])
#     data_df['persian'].append(li[1])

# data_df = pd.DataFrame(data_df)
# data_df.to_csv('tep_slice.csv', index=False, encoding='utf-8-sig')

In [22]:
# train test split

# random.shuffle(all_data)
n_val = int(0.15*len(all_data))
n_train = len(all_data) - 2*n_val
train_pairs = all_data[:n_train]
val_pairs = all_data[n_train:n_train+n_val]
test_pairs = all_data[n_train+n_val:]

train_data = [(src, tgt) for src, tgt in train_pairs if len(src) > 0]
val_data = [(src, tgt) for src, tgt in val_pairs if len(src) > 0]
test_data = [(src, tgt) for src, tgt in test_pairs if len(src) > 0]

print("train data samples count is: ", len(train_data))
print("test data samples count is: ", len(test_data))
print("validation data samples count is: ", len(val_data))




train data samples count is:  7000
test data samples count is:  1500
validation data samples count is:  1500


In [23]:
# build vocab
# Build src and tgt vocabs from the training set.
PAD, BOS, EOS = ("<PAD>", "<START>", "<END>")
tokenizer = get_tokenizer("basic_english")
en_counter, de_counter = Counter(), Counter()
for src, tgt in all_data:
    en_counter.update(tokenizer(src))
    de_counter.update(tokenizer(tgt))
de_vocab = vocab(de_counter, specials=[PAD, BOS, EOS])
en_vocab = vocab(en_counter, specials=[PAD, BOS, EOS])
pad_idx = de_vocab[PAD] # pad_idx is 0
assert en_vocab[PAD] == de_vocab[PAD]




In [24]:
# graph modules
# We assume that PyTorch is already installed
import torch
torchversion = torch.__version__

# Install PyTorch Scatter, PyTorch Sparse, and PyTorch Geometric
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-{torchversion}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-{torchversion}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git



# Visualization
import networkx as nx
import matplotlib.pyplot as plt


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone


In [25]:
import torch
from transformers import BertTokenizer, BertModel
import transformers
from sklearn.metrics.pairwise import cosine_similarity


In [26]:
def get_token_vector(bert_tokenizer, bert_embedding_model, token):
    token_bert = bert_tokenizer([token], return_tensors='pt')
    with torch.no_grad():
        outputs = bert_embedding_model(token_bert['input_ids'], attention_mask=token_bert['attention_mask'])
        word_embeddings = outputs.last_hidden_state
        if len(word_embeddings[0]) == 3:
            return word_embeddings[0][1]
        else:
            return torch.zeros(word_embeddings[0].shape[1])

In [27]:
train_pairs[20][0]

'alice , we cant make it rain together turn the dust into grass .'

'c'

In [28]:


bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_embedding_model = BertModel.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [29]:
sentence_temp = train_pairs[20][0].split(' ')
sentence_temp

['alice',
 ',',
 'we',
 'cant',
 'make',
 'it',
 'rain',
 'together',
 'turn',
 'the',
 'dust',
 'into',
 'grass',
 '.']

In [30]:
from scipy.spatial.distance import cosine


In [31]:

token1 = 'human'
token2 = 'man'
word1 = get_token_vector(bert_tokenizer, bert_embedding_model, token1)

word2 = get_token_vector(bert_tokenizer, bert_embedding_model, token2)

similarity_score = cosine(word1, word2)

# Print the similarity score
print("Cosine Similarity Score:", similarity_score, token1, token2)

Cosine Similarity Score: 0.29613661766052246 human man


In [32]:
token_bert = bert_tokenizer('cant', return_tensors='pt')
with torch.no_grad():
    outputs = bert_embedding_model(token_bert['input_ids'], attention_mask=token_bert['attention_mask'])
    word_embeddings = outputs.last_hidden_state

word_embeddings[0]

tensor([[-0.7788, -0.2292, -0.0473,  ..., -0.3022,  0.3633,  0.7057],
        [-0.0265, -0.6340,  0.2995,  ..., -0.0779,  0.3846, -0.3527],
        [-0.9770, -1.0330,  0.1572,  ...,  0.4266,  0.0926,  0.0622],
        [ 0.8668,  0.0340, -0.2735,  ...,  0.1848, -0.7666, -0.2814]])

In [33]:
token_bert = bert_tokenizer('cant not', return_tensors='pt')
with torch.no_grad():
    outputs = bert_embedding_model(token_bert['input_ids'], attention_mask=token_bert['attention_mask'])
    word_embeddings = outputs.last_hidden_state

word_embeddings[0].shape[1]

768

In [34]:
bert_tokenizer('i love you')

{'input_ids': [101, 1045, 2293, 2017, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [None]:


# Generate embeddings using BERT model


In [35]:
word_embeddings[0].shape

torch.Size([5, 768])

In [36]:
similarity_score = cosine_similarity(word_embeddings[0], word_embeddings[0])

# Print the similarity score
print("Cosine Similarity Score:", similarity_score[0][0])

Cosine Similarity Score: 0.99999964


In [37]:
# extracting graph of word similarity

import gensim.downloader as api
glove_model = api.load("glove-wiki-gigaword-100")



In [38]:
import numpy as np

def get_word_vector(glove_model, word):
    try:
        return glove_model[word]
    except:
        return np.zeros(100, dtype="float32")

def vector_distance(vec1, vec2):
    dist = np.linalg.norm(vec1-vec2)
    return dist


def token_similarity(t1, t2, glove_model):
    v1 = get_word_vector(glove_model, t1)
    v2 = get_word_vector(glove_model, t2)
    return vector_distance(v2, v1)

In [39]:

from torch_geometric.data import Data
from torch.nn import Linear, ReLU
from torch_geometric.nn import Sequential, GCNConv
from torch_geometric.nn import GCN, summary
import spacy
from spacy import displacy
dp_nlp = spacy.load("en_core_web_sm")



def extract_graph_DP(vocab, sentecnes, nlp):
    node_id = 0
    X = list()
    edges = list()

    node_poses = {}

    for sentence in sentecnes:
        doc = nlp(' '.join(sentence))
        token_to_node = {}
        for token_idx in range(len(sentence)):
            token = sentence[token_idx]

            try:
                X.append(np.array(vocab([token])))
            except:
                X.append(np.zeros(1, dtype=np.int32))

            token_to_node[str(token)] = node_id

            node_id += 1



        for token in doc:
            if token_to_node.get(str(token.head.text), []) and token_to_node.get(str(token.text), []):
                edges.append([token_to_node.get(str(token.head.text)), token_to_node.get(str(token.text)),])

    return X, edges



def extract_graph_POS(vocab, sentecnes):
    node_id = 0
    X = list()
    edges = list()

    node_poses = {}

    for sentence in sentecnes:
        pos_tags = nltk.pos_tag(sentence)
        for token_idx in range(len(sentence)):
            token = sentence[token_idx]
            pos = pos_tags[token_idx][1]

            try:
                X.append(np.array(vocab([token])))
            except:
                X.append(np.zeros(1, dtype=np.int32))



            # comapre with other nodes
            for n_id in node_poses.get(pos, []):
                edge = [node_id, n_id]
                edges.append(edge)

            # adding node text to dict of text nodes
            if node_poses.get(pos, 0):
                node_poses[pos].append(node_id)
            else:
                node_poses[pos] = [node_id]

            node_id += 1

    return X, edges



def extract_graph_AMR(vocab, sentecnes, stog):
    node_id = 0
    X = list()
    edges = list()
    node_text = dict()

    graphs = stog.parse_sents([' '.join(sent) for sent in sentecnes], add_metadata=True, )
    for sentence, geraph in zip(sentecnes, graphs):
        # print(sentence)
        # print(geraph)
        penman_graph = penman.decode(geraph)
        key_to_token = {instance.source:instance.target for instance in penman_graph.instances()}
        token_to_node = {}
        for token_idx in range(len(sentence)):
            token = sentence[token_idx]

            try:
                X.append(np.array(vocab([token])))
            except:
                X.append(np.zeros(1, dtype=np.int32))

            token_to_node[token] = node_id
            node_id += 1

        # print(token_to_node)
        # print(key_to_token)

        for edg in penman_graph.edges():
            source = key_to_token[edg.source]
            target = key_to_token[edg.target]
            try:
                edge = [token_to_node[source.split('-0')[0]], token_to_node[target.split('-0')[0]]]
                edges.append(edge)
            except:
                pass

    return X, edges



def extract_graph_EGCR(vocab, sentecnes):
    node_id = 0
    X = list()
    edges = list()
    node_text = dict()

    for sentence in sentecnes:
        for token_idx in range(len(sentence)):
            token = sentence[token_idx]

            try:
                X.append(np.array(vocab([token])))
            except:
                X.append(np.zeros(1, dtype=np.int32))

            # add sentence nodes
            if token_idx != 0:
                edge = [node_id, node_id-1]
                edges.append(edge)

            # comapre with other nodes
            for n_id in node_text.get(token, []):
                edge = [node_id, n_id]
                edges.append(edge)

            # adding node text to dict of text nodes
            if node_text.get(token, 0):
                node_text[token].append(node_id)
            else:
                node_text[token] = [node_id]

            node_id += 1

    return X, edges


def extract_graph_EGCR1(vocab, sentecnes):
    node_id = 0
    X = list()
    edges = list()
    node_text = dict()
    start_node_ids = []
    for sentence in sentecnes:
        start_node_id = node_id
        start_node_ids.append(start_node_id)
        for token_idx in range(len(sentence)):
            # sentence entitiy relation
            if start_node_id != node_id:
                edge = [start_node_id, node_id]
                edges.append(edge)
            token = sentence[token_idx]

            try:
                X.append(np.array(vocab([token])))
            except:
                X.append(np.zeros(1, dtype=np.int32))

            # add sentence nodes
            if token_idx != 0:
                edge = [node_id, node_id-1]
                edges.append(edge)

            # comapre with other nodes
            for n_id in node_text.get(token, []):
                edge = [node_id, n_id]
                edges.append(edge)

            # adding node text to dict of text nodes
            if node_text.get(token, 0):
                node_text[token].append(node_id)
            else:
                node_text[token] = [node_id]

            node_id += 1

    # inter sentence connection
    G = nx.complete_graph(start_node_ids)
    _edges = list(G.edges)
    _edges = [[_i[0], _i[1]]for _i in _edges]
    for edge in _edges:
        edges.append(edge)
    return X, edges

def extract_sentence_vectors(sentences, word_vectors):
    out = {}
    counter = 0
    for sentence in sentences:
        # print(sentence)

        text = ' '.join(sentence)
        # out[text] = tf.convert_to_tensor(torch.mean(word_vectors[counter:len(sentence)+counter], dim=0).to(torch.int64).detach().numpy())
        out[text] = torch.mean(word_vectors[counter:len(sentence)+counter], dim=0).to(torch.float32).detach()
        counter += len(sentence)

    return out


def extract_sentence_vectors_v2(sentences, word_vectors, out_size):
    out = list()
    counter = 0
    for sentence in sentences:
        # print(sentence)

        text = ' '.join(sentence)
        # out[text] = tf.convert_to_tensor(torch.mean(word_vectors[counter:len(sentence)+counter], dim=0).to(torch.int64).detach().numpy())
        avg = torch.mean(word_vectors[counter:len(sentence)+counter], dim=0)
        avg = torch.reshape(avg, out_size)
        # out.append( torch.abs(avg).to(torch.int32).detach() )
        out.append(avg.detach())
        counter += len(sentence)

    return out

def extract_graph_semantic(vocab, sentecnes, glove_model):
    node_id = 0
    X = list()
    edges = list()
    sim_threshold = 3

    node_poses = {}
    all_tokens = list()
    for sentence in sentecnes:
        for token_idx in range(len(sentence)):
            token = sentence[token_idx]

            try:
                X.append(np.array(vocab([token])))
            except:
                X.append(np.zeros(1, dtype=np.int32))

            for idx in range(len(all_tokens)):
                t = all_tokens[idx]
                sim = token_similarity(t, token, glove_model)
                if sim <= sim_threshold:
                    edge = [node_id, idx]
                    edges.append(edge)


            node_id += 1
            all_tokens.append(token)


    return X, edges

In [None]:
# pip install pyvis

# temp_sentenecs = [
#  ('do you recognize any of them .', 'هيچ کدوم از اينا را ميشناسي .'),
#  ('but theyre coming faster now and that can only mean one thing .',
#   'اما اونا حالا تندتر ميومدند و معني اين فقط يک چيز بود .'),
#  ('can we just try this .', ' ميتونيم فقط امتحانش کنيم .')]


#  sentences_tokens = [tokenizer(item[0].lower()) for item in temp_sentenecs]

# import networkx as nx
# import pylab as plt

# # train_graph_X, train_edges = extract_graph_DP(en_vocab, sentences_tokens, dp_nlp)
# # train_graph_X, train_edges = extract_graph_EGCR1(en_vocab, sentences_tokens)
# # train_graph_X, train_edges = extract_graph_POS(en_vocab, sentences_tokens)
# train_graph_X, train_edges =  extract_graph_semantic(vocab, sentences_tokens, glove_model)

# G=nx.Graph()
# # Add nodes and edges
# G.add_edges_from([(sentences_tokens_list[item[0]], sentences_tokens_list[item[1]]) for item in train_edges])

# from pyvis.network import Network
# import networkx as nx
# nx_graph = G
# nt = Network()
# # populates the nodes and edges data structures
# nt.from_nx(nx_graph)
# nt.show('semantic.html',  notebook=False)

In [40]:

def get_graph_vectors(batch, d_model, glove_model, g_type="EGCR"):

    sentences_tokens = [tokenizer(item[0].lower()) for item in batch]
    if g_type == "EGCR":
        train_graph_X, train_edges = extract_graph_EGCR(en_vocab, sentences_tokens)
    elif g_type == "POS":
        train_graph_X, train_edges = extract_graph_POS(en_vocab, sentences_tokens)
    elif g_type == "AMR":
        train_graph_X, train_edges = extract_graph_AMR(en_vocab, sentences_tokens, stog)
    elif g_type == "DP":
        train_graph_X, train_edges =  extract_graph_DP(vocab, sentences_tokens, dp_nlp)
    elif g_type == "semantic":
        train_graph_X, train_edges =  extract_graph_DP(vocab, sentences_tokens, glove_model)

    # print(train_edges)
    train_edges = torch.tensor( [[ item[0] for item in train_edges], [ item[1] for item in train_edges]], dtype=torch.long)
    train_graph_X = torch.tensor(np.array(train_graph_X), dtype=torch.float32 )

    # train_g_data = Data(x=train_graph_X, edge_index=train_edges)

    in_channels = 1
    max_senetnce_length = max([len(item) for item in sentences_tokens])
    out_channels =  max_senetnce_length * d_model
    # out_channels = d_model
    model = Sequential('x, edge_index', [
        (GCNConv(in_channels, 64), 'x, edge_index -> x'),
        ReLU(inplace=True),
        (GCNConv(64, 64), 'x, edge_index -> x'),
        ReLU(inplace=True),
        Linear(64, out_channels),
    ])

    word_vectors = model(train_graph_X, train_edges)



    vectros = extract_sentence_vectors_v2(sentences_tokens, word_vectors, (max_senetnce_length, d_model))

    return vectros


In [41]:
# model parametes
src_vocab_size = len(en_vocab)
tgt_vocab_size = len(de_vocab)
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = max([ max([len(src), len(tgt)]) for src,tgt in all_data]) + 1
dropout = 0.1

In [42]:
#  train , test, val loader

batch_size = 32
train_loader = data.DataLoader(
    dataset=train_data,
    # batch_size=batch_size, shuffle=True, drop_last=True,
    batch_size=batch_size,
    collate_fn=lambda batch: (
        pad_sequence(
            [torch.LongTensor(en_vocab(tokenizer(x))) for x, _ in batch],
            batch_first=True, padding_value=pad_idx),
        pad_sequence(
            [torch.LongTensor(de_vocab([BOS] + tokenizer(y) + [EOS])) for _, y in batch],
            batch_first=True, padding_value=pad_idx
            ),

        pad_sequence(
            get_graph_vectors(batch, d_model, 'EGCR'),
            batch_first=True, padding_value=pad_idx
            ),

        pad_sequence(
            get_graph_vectors(batch, d_model, 'POS'),
            batch_first=True, padding_value=pad_idx
            ),

        pad_sequence(
            get_graph_vectors(batch, d_model, 'DP'),
            batch_first=True, padding_value=pad_idx
            ),

        pad_sequence(
            get_graph_vectors(batch, d_model, 'semantic'),
            batch_first=True, padding_value=pad_idx
            ),

        ),

    num_workers=0,
)


val_loader = data.DataLoader(
    dataset=val_data,
    # batch_size=batch_size, shuffle=True, drop_last=True,
    batch_size=batch_size,
    collate_fn=lambda batch: (
        pad_sequence(
            [torch.LongTensor(en_vocab(tokenizer(x))) for x, _ in batch],
            batch_first=True, padding_value=pad_idx),
        pad_sequence(
            [torch.LongTensor(de_vocab([BOS] + tokenizer(y) + [EOS])) for _, y in batch],
            batch_first=True, padding_value=pad_idx
            ),

        pad_sequence(
            get_graph_vectors(batch, d_model, 'EGCR'),
            batch_first=True, padding_value=pad_idx
            ),

        pad_sequence(
            get_graph_vectors(batch, d_model, 'POS'),
            batch_first=True, padding_value=pad_idx
            ),

        pad_sequence(
            get_graph_vectors(batch, d_model, 'DP'),
            batch_first=True, padding_value=pad_idx
            ),
        pad_sequence(
            get_graph_vectors(batch, d_model, 'semantic'),
            batch_first=True, padding_value=pad_idx
            ),

    ),
    num_workers=0,
)



test_loader = data.DataLoader(
    dataset=test_data,
    # batch_size=batch_size, shuffle=True, drop_last=True,
    batch_size=batch_size,
    collate_fn=lambda batch: (
        pad_sequence(
            [torch.LongTensor(en_vocab(tokenizer(x))) for x, _ in batch],
            batch_first=True, padding_value=pad_idx),
        pad_sequence(
            [torch.LongTensor(de_vocab([BOS] + tokenizer(y) + [EOS])) for _, y in batch],
            batch_first=True, padding_value=pad_idx
            ),

        pad_sequence(
            get_graph_vectors(batch, d_model, 'EGCR'),
            batch_first=True, padding_value=pad_idx
            ),

        pad_sequence(
            get_graph_vectors(batch, d_model, 'POS'),
            batch_first=True, padding_value=pad_idx
            ),

        pad_sequence(
            get_graph_vectors(batch, d_model, 'DP'),
            batch_first=True, padding_value=pad_idx
            ),

        pad_sequence(
            get_graph_vectors(batch, d_model, 'semantic'),
            batch_first=True, padding_value=pad_idx
            ),


    ),
    num_workers=0,
)

In [43]:
import re

def remove_dot_from_text(text):
    """
    most of token in pred is ., so we will remove them
    """
    text = text.replace('.', '')
    text = re.sub(' +', ' ', text)
    return text

In [44]:
# evaluation modules
import numpy as np
from tqdm import tqdm
import pandas as pd
import plotly.express as px

from nltk.translate.bleu_score import sentence_bleu


!pip install -q evaluate

import evaluate
import json

!pip install -q jiwer

from jiwer import wer

# ! pip install --force-reinstall kaleido==0.1.0post1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [47]:
epochs = 1


In [48]:
for g_tyep in ["basic", "egcr", "pos", "dp", "semantic", "all"]:
    try:
        del transformer
    except:
        pass
    transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, g_tyep)
    transformer = transformer.to(device)


    # train val
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

    # pbar = tqdm(total=epochs*64, desc="Iteration")
    trainingEpoch_loss = []
    validationEpoch_loss = []
    for epoch in range(epochs):
        step_loss = []
        transformer.train()
        for src, tgt, grf1, grf2, grf3, grf4 in train_loader:

            src = src.to(device)
            tgt = tgt.to(device)
            grf1 = grf1.to(device)
            grf2 = grf2.to(device)
            grf3 = grf3.to(device)
            grf4 = grf4.to(device)

            # print(src.shape, grf.shape)
            # print(src.shape)
            # print(grf.shape)
            optimizer.zero_grad()
            output = transformer(src, tgt[:, :-1], grf1, grf2, grf3, grf4)
            loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt[:, 1:].contiguous().view(-1))
            loss.backward()
            optimizer.step()
            step_loss.append(loss.item())

        trainingEpoch_loss.append(np.array(step_loss).mean())

        # model.eval()     # Optional when not using Model Specific layer
        transformer.eval()
        validationStep_loss = []
        for src, tgt, grf1, grf2, grf3, grf4 in val_loader:
            src = src.to(device)
            tgt = tgt.to(device)
            grf1 = grf1.to(device)
            grf2 = grf2.to(device)
            grf3 = grf3.to(device)
            grf4 = grf4.to(device)

            # Forward Pass
            output = transformer(src, tgt[:, :-1], grf1, grf2, grf3, grf4)
            # Find the Loss
            validation_loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt[:, 1:].contiguous().view(-1))
            # Calculate Loss
            validationStep_loss.append(validation_loss.item())

        validationEpoch_loss.append(np.array(validationStep_loss).mean())


        print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

        # call back
        if len(validationEpoch_loss) > 3 and validationEpoch_loss[-1] >= validationEpoch_loss[-2] and validationEpoch_loss[-2] >= validationEpoch_loss[-3]:
            break


    # save model plots in a csv file
    pd_data = {'Train Loss': trainingEpoch_loss,
            'Validation Loss': validationEpoch_loss,
            'Epoch': list(range(1, len(trainingEpoch_loss)+1))}

    # Create DataFrame
    plot_df = pd.DataFrame(pd_data)

    plot_df.head()
    plot_df.to_csv(f'{g_tyep}.csv')


    # Creating the Figure instance
    fig = px.line(plot_df, x='Epoch', y=['Train Loss', 'Validation Loss'])


    # adding different style parameters to the legend
    fig.update_layout(
        legend=dict(
            x=0.7,
            y=0.9,
            title="Loss",
            title_font_family="Times New Roman",
            font=dict(
                family="Courier",
                size=14,
                color="black"
            ),
            # borderwidth=1
        ),
        autosize=False,
        width=1200,
        height=600,
    )

    fig.show()
    # fig.write_image(f"{g_tyep}.png", format='png',engine='kaleido')

    # evaluate

    transformer.eval()

    # # Generate random sample validation data
    # val_src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
    # val_tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)

    pred_sentences = []
    real_sentences = []


    with torch.no_grad():
        for src, tgt, grf1, grf2, grf3, grf4 in test_loader:
            src = src.to(device)
            tgt = tgt.to(device)
            grf1 = grf1.to(device)
            grf2 = grf2.to(device)
            grf3 = grf3.to(device)
            grf4 = grf4.to(device)

            val_output = transformer(src, tgt[:, :-1], grf1, grf2, grf3, grf4)

            val_loss = criterion(val_output.contiguous().view(-1, tgt_vocab_size), tgt[:, 1:].contiguous().view(-1))

            pred_index_out = [ [ torch.argmax(token_preds).item() for token_preds in item] for item in val_output]
            pred_token_out = [ de_vocab.lookup_tokens(indices) for indices in pred_index_out]
            pred_sentences += [ [token  for token in tokens if token not in ['<START>', '<END>', '<PAD>'] ] for tokens in pred_token_out ]

            real_token_out = [ de_vocab.lookup_tokens(indices.cpu().numpy()) for indices in tgt]
            real_sentences += [ [token  for token in tokens if token not in ['<START>', '<END>', '<PAD>'] ] for tokens in real_token_out ]


    real_sentences = [' '.join(item) for item in real_sentences]
    pred_sentences = [' '.join(item) for item in pred_sentences]


    real_sentences = [remove_dot_from_text(item) for item in real_sentences]
    pred_sentences = [remove_dot_from_text(item) for item in pred_sentences]
    real_sentences1 = [ [item] for item in real_sentences]

    # evluate


    bleu = evaluate.load("bleu")
    results = bleu.compute(predictions=pred_sentences, references=real_sentences1)
    json.dump( results, open( f"{g_tyep}_4.json", 'w' ) )

    bleu = evaluate.load("bleu")
    results = bleu.compute(predictions=pred_sentences, references=real_sentences1, max_order=1)
    json.dump( results, open( f"{g_tyep}_1.json", 'w' ) )

    wer_scores = [] # [bleu_score, pred, ref]
    for r_sample, p_sample in zip(real_sentences1, pred_sentences):
        error = wer(r_sample, p_sample)
        wer_scores.append([error, p_sample, r_sample[0]])

    print("wer error is: ", sum([item[0] for item in wer_scores])/len(wer_scores))

    temp_wer_dict = {"wer": sum([item[0] for item in wer_scores])/len(wer_scores)}
    json.dump( temp_wer_dict, open( f"{g_tyep}_wer.json", 'w' ) )

    wer_scores = sorted(wer_scores, reverse=True)

    with open(f"{g_tyep}_samples.txt", "w") as f:
        for item in wer_scores:
            item = [str(i) for i in item]
            f.write(' ;; '.join(item))
            f.write("\n")



Epoch: 1, Loss: 5.21380090713501


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

wer error is:  0.8589336263630183
Epoch: 1, Loss: 5.243624210357666


wer error is:  0.8561898101717572
Epoch: 1, Loss: 5.208500385284424


wer error is:  0.8837455819341059
Epoch: 1, Loss: 5.316232681274414


wer error is:  0.8948310318728848
Epoch: 1, Loss: 5.2148966789245605


wer error is:  0.8507563824692773
Epoch: 1, Loss: 5.2134809494018555


wer error is:  0.8490530758133504


In [52]:
pip install -q transformers


In [53]:
pip install -q huggingface_hub

In [55]:
! huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want

In [56]:
! transformers-cli -h


2024-04-16 12:41:22.002906: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-16 12:41:22.002979: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-16 12:41:22.004290: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
usage: transformers-cli <command> [<args>]

positional arguments:
  {convert,download,env,run,serve,login,whoami,logout,repo,add-new-model,add-new-model-like,lfs-enable-largefiles,lfs-multipart-upload,pt-to-tf}
                        transformers-cli command helpers
    convert             CLI tool to run convert model from original author checkpoints to
       

In [58]:
! huggingface-cli repo create dummy-model

[90mgit version 2.34.1[0m
[90mgit-lfs/3.0.2 (GitHub; linux amd64; go 1.18.1)[0m

You are about to create [1mmohammad2928git/dummy-model[0m
Proceed? [Y/n] y

Your repo now lives at:
  [1mhttps://huggingface.co/mohammad2928git/dummy-model[0m

You can clone it locally with the command below, and commit/push as usual.

  git clone https://huggingface.co/mohammad2928git/dummy-model

