Commit 996b3fa (verified)
Lazy-Val committed · Parent: 0ae9004

Update spaCy pipeline

README.md CHANGED
@@ -9,7 +9,7 @@ language:
 | --- | --- |
 | **Name** | `fr_trf_reflex_nrp` |
 | **Version** | `0.0.0` |
-| **spaCy** | `>=3.8.3,<3.9.0` |
+| **spaCy** | `>=3.8.7,<3.9.0` |
 | **Default Pipeline** | `ner_transformer`, `ner`, `base_transformer`, `morphologizer`, `tagger`, `parser`, `trainable_lemmatizer` |
 | **Components** | `ner_transformer`, `ner`, `base_transformer`, `morphologizer`, `tagger`, `parser`, `trainable_lemmatizer` |
 | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
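For reference, the packaged pipeline loads like any other spaCy model. A minimal sketch, assuming the wheel shipped in this repo is installed and that the package registers its custom tokenizer on import:

```python
import spacy

# Assumes prior installation, e.g.:
#   pip install fr_trf_reflex_nrp-0.0.0-py3-none-any.whl
# (requires spacy>=3.8.7,<3.9.0 per the updated pin above)
nlp = spacy.load("fr_trf_reflex_nrp")
print(nlp.pipe_names)
# ['ner_transformer', 'ner', 'base_transformer', 'morphologizer',
#  'tagger', 'parser', 'trainable_lemmatizer']
```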
base_transformer/model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9c140cc7cc6da464b19f21950129cdc7421053fb79ff5e34666fc9527905204a
-size 443537896
+oid sha256:273f8e8a1a291514e3dd91408f0e03434995c72b68fcddfb944f35897c8a0429
+size 443538293
config.cfg CHANGED
@@ -16,7 +16,7 @@ before_creation = null
 after_creation = null
 after_pipeline_creation = null
 batch_size = 1000
-tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+tokenizer = {"@tokenizers":"customize_tokenizer"}
 vectors = {"@vectors":"spacy.Vectors.v1"}
 
 [components]
@@ -90,7 +90,7 @@ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotati
 
 [components.ner_transformer.model]
 @architectures = "spacy-transformers.TransformerModel.v3"
-name = "dbmdz/bert-base-german-cased"
+name = "almanach/camembertav2-base"
 mixed_precision = false
 
 [components.ner_transformer.model.get_spans]
@@ -228,21 +228,24 @@ eps = 0.00000001
 learn_rate = 0.001
 
 [training.score_weights]
-ents_f = 0.2
+ents_f = 0.22
 ents_p = 0.0
 ents_r = 0.0
 ents_per_type = null
-pos_acc = 0.1
-morph_acc = 0.1
+pos_acc = 0.0
+morph_acc = 0.11
 morph_per_feat = null
-tag_acc = 0.2
-dep_uas = 0.1
-dep_las = 0.1
+tag_acc = 0.22
+tag_micro_p = null
+tag_micro_r = null
+tag_micro_f = null
+dep_uas = 0.11
+dep_las = 0.11
 dep_las_per_type = null
 sents_p = null
 sents_r = null
 sents_f = 0.0
-lemma_acc = 0.2
+lemma_acc = 0.22
 
 [pretraining]
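The `[nlp]` block now resolves the tokenizer through a registered function named `customize_tokenizer` instead of the stock `spacy.Tokenizer.v1`, and the score weights were rebalanced (summing to roughly 1.0 with `pos_acc` zeroed out). Below is a sketch of how such a registry entry is typically wired up; the exact registration code ships inside the package, so the delegation to `custom_tokenizer` from `use_custom_tokenizer.py` (diffed below) is an assumption:

```python
import spacy
from spacy.tokenizer import Tokenizer

from use_custom_tokenizer import custom_tokenizer  # module diffed further down


@spacy.registry.tokenizers("customize_tokenizer")
def create_customize_tokenizer():
    # The registry entry returns a factory (nlp -> Tokenizer) that spaCy
    # calls when assembling the pipeline from config.cfg.
    def create_tokenizer(nlp) -> Tokenizer:
        return custom_tokenizer(nlp)

    return create_tokenizer
```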
fr_trf_reflex_nrp-0.0.0-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a864066b3962b7185b3eaca0bf0cc55674ffc1a84e7f3a9fa4e3c41e20f50c1f
-size 822780428
+oid sha256:719a5cd2ea2461b701b8535bfbaa591227cb6a63ba60549c1c6f09efcd3c27bc
+size 824582048
meta.json CHANGED
@@ -7,8 +7,8 @@
 "email":"",
 "url":"",
 "license":"",
-"spacy_version":">=3.8.3,<3.9.0",
-"spacy_git_version":"be0fa81",
+"spacy_version":">=3.8.7,<3.9.0",
+"spacy_git_version":"4b65aa7",
 "vectors":{
 "width":0,
 "vectors":0,
@@ -586,7 +586,7 @@
 
 ],
 "requirements":[
-"spacy-transformers>=1.3.8,<1.4.0",
-"spacy>=3.8.3,<3.9.0"
+"spacy-transformers>=1.3.9,<1.4.0",
+"spacy>=3.8.7,<3.9.0"
 ]
 }
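The tightened pins can be verified against a local environment; a small sketch, assuming the `packaging` library is available:

```python
from importlib.metadata import version
from packaging.specifiers import SpecifierSet

# Pins taken from the updated meta.json requirements
assert version("spacy") in SpecifierSet(">=3.8.7,<3.9.0")
assert version("spacy-transformers") in SpecifierSet(">=1.3.9,<1.4.0")
```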
ner/model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:188374b00a1220f136095477c9f81dcb148c5c35b84d38c03f5bf24098c4e4f2
+oid sha256:4ed9d24f9c390a1c7c7ad6bb84a402dafa34d3210d998e2a365e2830f9590edf
 size 219690
ner/moves CHANGED
@@ -1 +1 @@
-��movesٴ{"0":{},"1":{"ORG":5328,"LOC":3766,"PER":3605},"2":{"ORG":5328,"LOC":3766,"PER":3605},"3":{"ORG":5328,"LOC":3766,"PER":3605},"4":{"ORG":5328,"LOC":3766,"PER":3605,"":1},"5":{"":1}}�cfg��neg_key�
+��movesٴ{"0":{},"1":{"ORG":6029,"LOC":4464,"PER":3965},"2":{"ORG":6029,"LOC":4464,"PER":3965},"3":{"ORG":6029,"LOC":4464,"PER":3965},"4":{"ORG":6029,"LOC":4464,"PER":3965,"":1},"5":{"":1}}�cfg��neg_key�
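The per-label counts in the moves table (ORG, LOC, PER) all grew, consistent with retraining on more annotated entities. The label set itself is easiest to confirm from the loaded pipeline; a sketch:

```python
import spacy

nlp = spacy.load("fr_trf_reflex_nrp")
ner = nlp.get_pipe("ner")
print(ner.labels)  # expected: ('LOC', 'ORG', 'PER'); order may vary
```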
ner_transformer/model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aa876964843bdadeeaf73bebae2103e27ca655f50c96fd666d6e5ffd58d10c8f
-size 440759212
+oid sha256:755e7434a8f1df5e8dede50927501cb06e1264baa504ddb2dd909fbbf1740da6
+size 443538293
tokenizer CHANGED
The diff for this file is too large to render.
 
use_custom_tokenizer.py CHANGED
@@ -24,17 +24,26 @@ EXTENDED_LETTER_RANGE = "A-Za-zäöüÄÖÜàòèéìù"
 DATE = r"[0-3][1-9]\.[0-1][1-9]\.[1-2][0-9]{3}"
 TOP_LEVEL_DOMAINS = "ch|at|de|com|edu|org|gov|net|fr|uk|be|es|pl|it|eu|nl|ba|cz|dk|al|ad|bg|by|fi|gr|ie|li|lu|no|pt|ro|rs|ru|se|si|sk"
 
+DOT_BEFORE_WORD = [
+    rf"(?<!www)\.(?=[{EXTENDED_LETTER_RANGE}]{{{i}}})(?!(({TOP_LEVEL_DOMAINS})\b))"
+    for i in range(3, 30)
+]
+
+
 DOT_AFTER_WORD = [
-    rf"(?<!www\.)(?<=([a-zA-ZäöüÄÖÜ]){{{i}}})\.(?!({TOP_LEVEL_DOMAINS}))"
+    rf"(?<!www\.)(?<=([{EXTENDED_LETTER_RANGE}]){{{i}}})\.(?!({TOP_LEVEL_DOMAINS}))"
     for i in range(3, 30)
 ]
 
+
 DOT_AFTER_DATE = rf"(?<=({DATE}))\."
 
 infix_res = [
-    r"[\(\[\]\)]",
+    r"[\(\[\]\)\,\»\>\«\<]",  # brackets, commas and ' that are without space
     r"(?<=\.--)\.",  # DOT after .--
-    rf"\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})",  # DOT before word
+    r"(?<=\,)\'",
+    *DOT_BEFORE_WORD,  # DOT before word
+    # rf"(?<!www)\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})",
     r"'\.\.",  # e.g., 'Tscheicha'.. -> "Tscheicha" "'..", then split ".." as suffix
     *DOT_AFTER_WORD,  # when there is no space after the dot
     r"[A-Z](?=\. )",  # DOT after capital letter
@@ -45,6 +54,11 @@ LETTER_DOUBLE_ENDING_DOT_VAR_LENGTH = [ # DOT after letter, e.g., A.G., or u.s.
     rf"(?<=([{EXTENDED_LETTER_RANGE}]\.){{{i}}})\." for i in range(1, 30)
 ]
 
+prefix_res = [
+    rf"\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})",  # DOT before word
+    *DOT_BEFORE_WORD,  # DOT before word
+]
+
 suffix_res = [
     r"(?<=\d)[\.]",  # DOT after number
     r"(?<=[\.])[\]\)]",  # Closing brackets with DOT before
@@ -63,10 +77,12 @@ NUMBER_SIGN_NUMBER_FIXED = r"(?<=[0-9])[+\*^](?=[0-9])"
 
 
 # Given a nlp object, return a custom tokenizer that splits on special cases and with unwanted tokenization removed
-def custom_tokenizer(nlp):
+def custom_tokenizer(nlp) -> Tokenizer:
     nlp.tokenizer = Tokenizer(nlp.vocab)
 
-    prefix_regex = compile_prefix_regex(nlp.Defaults.prefixes)
+    prefixes = nlp.Defaults.prefixes
+    prefixes += prefix_res
+    prefix_regex = compile_prefix_regex(prefixes)
     nlp.tokenizer.prefix_search = prefix_regex.search
 
     # We use the default infixes and remove some cases that lead to unwanted tokenization.
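To see what the new `DOT_BEFORE_WORD` patterns match, here is a standalone check with plain `re` (constants copied from the file above; the sample strings are illustrative):

```python
import re

EXTENDED_LETTER_RANGE = "A-Za-zäöüÄÖÜàòèéìù"
TOP_LEVEL_DOMAINS = "ch|at|de|com|edu|org|gov|net|fr|uk|be|es|pl|it|eu|nl|ba|cz|dk|al|ad|bg|by|fi|gr|ie|li|lu|no|pt|ro|rs|ru|se|si|sk"

DOT_BEFORE_WORD = [
    rf"(?<!www)\.(?=[{EXTENDED_LETTER_RANGE}]{{{i}}})(?!(({TOP_LEVEL_DOMAINS})\b))"
    for i in range(3, 30)
]
pattern = re.compile("|".join(DOT_BEFORE_WORD))

print(bool(pattern.search("fin.Ensuite")))  # True: dot glued to the next word -> split as infix
print(bool(pattern.search("www.example")))  # False: (?<!www) protects URLs
print(bool(pattern.search("example.com")))  # False: the TLD lookahead protects domain endings
```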
vocab/strings.json CHANGED
The diff for this file is too large to render.