ageng-anugrah committed on
Commit
5ef7b11
1 Parent(s): 77f3817

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +66 -0
README.md ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: id
3
+ tags:
4
+ - indobert
5
+ - indobenchmark
6
+ ---
7
+
8
+ ## How to use
9
+
10
+ ### Load model and tokenizer
11
+ ```python
12
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Load the checkpoint together with its token-classification head. The bare
# encoder returned by AutoModel yields hidden states rather than per-label
# logits, which is not what NER tagging needs.
tokenizer = AutoTokenizer.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-ner")
model = AutoModelForTokenClassification.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-ner")
16
+ ```
17
+
18
+ ### Extract NER Tag
19
+ ```python
20
import torch


def predict(model, tokenizer, sentence):
    """Tag every whitespace-separated word of ``sentence`` with an NER label.

    Args:
        model: Torch module called as ``model(input_ids, attention_mask=...)``;
            the first element of its output is used as the logits, with one
            score per entry of the label table below.
        tokenizer: Callable tokenizer that accepts pre-split words and returns
            ``input_ids``, ``attention_mask`` and ``offset_mapping`` tensors.
        sentence: Raw text; it is split on whitespace into words.

    Returns:
        A tuple ``(words, labels)``: the whitespace-split words and the label
        predicted for the first word piece of each word.

    Raises:
        ValueError: If the size of the model's last output dimension differs
            from the number of known labels (e.g. the model has no
            token-classification head).
    """
    # will be moved to config later
    ids_to_labels = {
        0: 'B-ORGANISATION',
        1: 'B-PERSON',
        2: 'B-PLACE',
        3: 'I-ORGANISATION',
        4: 'I-PERSON',
        5: 'I-PLACE',
        6: 'O',
    }

    words = sentence.split()
    if not words:
        # Nothing to tag; skip tokenisation and the forward pass entirely.
        return words, []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(
        words,
        is_split_into_words=True,
        return_offsets_mapping=True,
        return_tensors="pt",
    )

    model.to(device)
    # move to gpu (when one is available)
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    # Inference only: disable dropout and gradient tracking for the forward
    # pass, then put the model back in whatever mode the caller had it in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(ids, attention_mask=mask)[0]
    finally:
        model.train(was_training)

    if logits.shape[-1] != len(ids_to_labels):
        raise ValueError(
            f"model produced {logits.shape[-1]} scores per token but "
            f"{len(ids_to_labels)} labels are defined; make sure the model "
            "has a token-classification head"
        )

    # A single sentence is encoded, so only batch entry 0 exists.
    label_ids = logits.argmax(dim=-1)[0].tolist()
    offsets = inputs["offset_mapping"][0].tolist()

    # Only predictions on first word pieces are kept: with pre-split input the
    # offsets are relative to each word, so a first piece starts at 0, while
    # special tokens carry the empty span (0, 0).
    prediction = [
        ids_to_labels[label_id]
        for label_id, (start, end) in zip(label_ids, offsets)
        if start == 0 and end != 0
    ]

    return words, prediction
63
+
64
# Example: tag a sample sentence; `words` and `labels` are the two values
# returned by predict().
sentence = "BJ Habibie adalah Presiden Indonesia ke-3"
words, labels = predict(model=model, tokenizer=tokenizer, sentence=sentence)
66
+ ```