AitBAD committed on
Commit
edf6cf4
·
verified ·
1 Parent(s): c2bf59a

Update backend.py

Browse files

Corrected a few typos and completed a few missing rules.
This is a work in progress.

Files changed (1) hide show
  1. backend.py +11 -10
backend.py CHANGED
@@ -33,19 +33,20 @@ class KabyleASR:
33
  str: The post-processed text with correct annexation dashes.
34
  """
35
  # Dictionaries for a set of particles based on the rules document v4, with corrections.
36
- CoPa = {'d', 'n', 'ur', 'i'}
37
- PoPro = {'inu', 'inem', 'ines', 'nneɣ', 'ntex', 'nwen', 'nwent', 'nsen', 'nsent',
38
- 'iw', 'ik', 'im', 'is', 'w', 'k', 'm', 's', 'tneɣ', 'tentex', 'tsen', 'tsent'}
39
- SpWo = {'deg', 'gar', 'ɣer', 'ɣur', 'fell', 'ɣef', 'ddaw', 'nnig', 'ɣid', 'aql', 'sɣur', 'sennig', 'deffir', 'sdat'}
 
40
  # Split StPa into a main group and a special group based on user feedback
41
  StPaSp = {'i', 'am', 'at', 's', 'neɣ', 'aɣ'}
42
  StPa = {'ak', 'as', 'aneɣ', 'anteɣ', 'awen', 'awent', 'asen', 'asent',
43
- 'k', 'm', 'ntex', 'wen', 'went', 'sen', 'sent', 'akem', 'att',
44
- 'aken', 'akent', 'aten', 'atent'}
45
- DePa = {'a', 'agi', 'nni', 'ihin', 'nniden'}
46
  DiPa = {'id', 'in'}
47
- FuPa = {'ad', 'ara'}
48
- DiObPa = {'yi', 'k', 'kem', 't', 'tt', 'ay', 'ken', 'kent', 'ten', 'tent',
49
  'iyi', 'ik', 'ikem', 'it', 'itt', 'iken', 'ikent', 'iten', 'itent'}
50
  InObPa = {'yi', 'yak', 'yam', 'yas', 'yaɣ', 'yawen', 'yawent', 'yasen', 'yasent'}
51
  # Combined set for general lookup, including both StPa groups
@@ -53,7 +54,7 @@ class KabyleASR:
53
  # The set of particles that can be annexed according to Rule 9, now correctly excluding StPaSp
54
  rule_9_particles = DiObPa.union(InObPa).union(DiPa).union(StPa)
55
  # The full set of state particles for other rules (like Rule 5 and 11)
56
- full_stpa_set = StPa.union(StPaSp)
57
  # Particles that can be part of the chain after FuPa (Rule 11)
58
  rule_11_particles = DiObPa.union(DiPa).union(full_stpa_set)
59
  # First, tokenize the text by splitting on spaces and existing dashes
 
33
  str: The post-processed text with correct annexation dashes.
34
  """
35
  # Dictionaries for a set of particles based on the rules document v4, with corrections.
36
+ # Based on kabyle_asr_optimized.py
37
+ CoPa = {'d', 'n', 's'}
38
+ PoPro = {'inu', 'inem', 'ines', 'nneɣ', 'nteɣ', 'nwen', 'nwent', 'nsen', 'nsent',
39
+ 'iw', 'ik', 'im', 'is', 'w', 'k', 'm', 'tneɣ', 'tenteɣ', 'twen', 'twent', 'tsen', 'tsent'}
40
+ SpWo = {'deg', 'gar', 'ɣer', 'ɣur', 'fell', 'ɣef', 'ddaw', 'nnig', 'yid', 'yes', 'yis', 'aql', 'sɣur', 'sennig', 'deffir', 'sdat'}
41
  # Split StPa into a main group and a special group based on user feedback
42
  StPaSp = {'i', 'am', 'at', 's', 'neɣ', 'aɣ'}
43
  StPa = {'ak', 'as', 'aneɣ', 'anteɣ', 'awen', 'awent', 'asen', 'asent',
44
+ 'k', 'm', 'nteɣ', 'wen', 'went', 'sen', 'sent', 'atneɣ', 'atenteɣ',
45
+ 'atwen', 'atwent', 'atsen', 'atsent'}
46
+ DePa = {'a', 'agi', 'nni', 'ihin', 'nniḍen'}
47
  DiPa = {'id', 'in'}
48
+ FuPa = {'ad', 'ara','ur'}
49
+ DiObPa = {'yi', 'k', 'kem', 't', 'tt', '', 'ken', 'kent', 'ten', 'tent',
50
  'iyi', 'ik', 'ikem', 'it', 'itt', 'iken', 'ikent', 'iten', 'itent'}
51
  InObPa = {'yi', 'yak', 'yam', 'yas', 'yaɣ', 'yawen', 'yawent', 'yasen', 'yasent'}
52
  # Combined set for general lookup, including both StPa groups
 
54
  # The set of particles that can be annexed according to Rule 9, now correctly excluding StPaSp
55
  rule_9_particles = DiObPa.union(InObPa).union(DiPa).union(StPa)
56
  # The full set of state particles for other rules (like Rule 5 and 11)
57
+ full_stpa_set = StPa.union(StPaSp).union(DiObPa) # Added DiObPa for Rule 5
58
  # Particles that can be part of the chain after FuPa (Rule 11)
59
  rule_11_particles = DiObPa.union(DiPa).union(full_stpa_set)
60
  # First, tokenize the text by splitting on spaces and existing dashes