okuparinen committed on
Commit
fa969c9
·
verified ·
1 Parent(s): 2cd29aa

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +40 -2
README.md CHANGED
@@ -23,9 +23,47 @@ You can use this model for automatic dialectal transcription of Finnish dialects
23
 
24
  ## How to Get Started with the Model
25
 
26
- Use the code below to get started with the model.
 
 
 
 
27
 
28
- TBA
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  ### Training Data
31
 
 
23
 
24
  ## How to Get Started with the Model
25
 
26
+ ```
27
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer
28
+ from datasets import Dataset, Audio
29
+ import torch
30
+ import pandas as pd
31
 
32
+ ds = pd.read_csv('CSV_DATA.csv')
33
+ ds = ds.dropna(how='any', axis=0)
34
+
35
+ test = Dataset.from_pandas(ds)
36
+ test = test.cast_column("AUDIO_PATH_COLUMN", Audio(sampling_rate=16000))
37
+
38
+ tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("okuparinen/SKN_300m_simple", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
39
+ model = Wav2Vec2ForCTC.from_pretrained("okuparinen/SKN_300m_simple").to("cuda")
40
+ processor = Wav2Vec2Processor.from_pretrained("okuparinen/SKN_300m_simple", tokenizer=tokenizer)
41
+
42
+ def prepare_dataset(batch):
43
+     audio = batch["AUDIO_PATH_COLUMN"]
44
+     batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
45
+     batch["input_length"] = len(batch["input_values"])
46
+
47
+     return batch
48
+
49
+ test_ready = test.map(prepare_dataset, remove_columns=test.column_names)
50
+
51
+ length = len(test)
52
+ predictions = []
53
+
54
+ for i in range(0, length, 1):
55
+     input_dict = processor(test_ready[i]["input_values"], return_tensors="pt", padding=True)
56
+     logits = model(input_dict.input_values.to("cuda")).logits
57
+
58
+     pred_ids = torch.argmax(logits, dim=-1)[0]
59
+
60
+     prediction = processor.decode(pred_ids)
61
+     predictions.append(prediction)
62
+
63
+ with open("OUTFILE.txt", "w") as f_pred:
64
+     for line in predictions:
65
+         f_pred.write(line + '\n')
66
+ ```
67
 
68
  ### Training Data
69