Update spaCy pipeline
- README.md +1 -1
- base_transformer/model +2 -2
- config.cfg +12 -9
- fr_trf_reflex_nrp-0.0.0-py3-none-any.whl +2 -2
- meta.json +4 -4
- ner/model +1 -1
- ner/moves +1 -1
- ner_transformer/model +2 -2
- tokenizer +0 -0
- use_custom_tokenizer.py +21 -5
- vocab/strings.json +0 -0
README.md
CHANGED
@@ -9,7 +9,7 @@ language:
 | --- | --- |
 | **Name** | `fr_trf_reflex_nrp` |
 | **Version** | `0.0.0` |
-| **spaCy** | `>=3.8.
+| **spaCy** | `>=3.8.7,<3.9.0` |
 | **Default Pipeline** | `ner_transformer`, `ner`, `base_transformer`, `morphologizer`, `tagger`, `parser`, `trainable_lemmatizer` |
 | **Components** | `ner_transformer`, `ner`, `base_transformer`, `morphologizer`, `tagger`, `parser`, `trainable_lemmatizer` |
 | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
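For a quick smoke test after installing the wheel, a minimal sketch (the sample sentence and printed attributes are illustrative, not from the repo):

```python
import spacy

# Assumes the packaged wheel has been installed, e.g.
#   pip install fr_trf_reflex_nrp-0.0.0-py3-none-any.whl
# under a spaCy version satisfying >=3.8.7,<3.9.0.
nlp = spacy.load("fr_trf_reflex_nrp")

# Hypothetical French sample; the pipeline also fills POS tags and lemmas.
doc = nlp("Le Tribunal fédéral siège à Lausanne.")
print([(ent.text, ent.label_) for ent in doc.ents])
print([(tok.text, tok.pos_, tok.lemma_) for tok in doc])
```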
base_transformer/model
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:273f8e8a1a291514e3dd91408f0e03434995c72b68fcddfb944f35897c8a0429
+size 443538293
config.cfg
CHANGED
@@ -16,7 +16,7 @@ before_creation = null
 after_creation = null
 after_pipeline_creation = null
 batch_size = 1000
-tokenizer = {"@tokenizers":"
+tokenizer = {"@tokenizers":"customize_tokenizer"}
 vectors = {"@vectors":"spacy.Vectors.v1"}
 
 [components]
@@ -90,7 +90,7 @@ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotati
 
 [components.ner_transformer.model]
 @architectures = "spacy-transformers.TransformerModel.v3"
-name = "
+name = "almanach/camembertav2-base"
 mixed_precision = false
 
 [components.ner_transformer.model.get_spans]
@@ -228,21 +228,24 @@ eps = 0.00000001
 learn_rate = 0.001
 
 [training.score_weights]
-ents_f = 0.
+ents_f = 0.22
 ents_p = 0.0
 ents_r = 0.0
 ents_per_type = null
-pos_acc = 0.
-morph_acc = 0.
+pos_acc = 0.0
+morph_acc = 0.11
 morph_per_feat = null
-tag_acc = 0.
-dep_uas = 0.
-dep_las = 0.
+tag_acc = 0.22
+tag_micro_p = null
+tag_micro_r = null
+tag_micro_f = null
+dep_uas = 0.11
+dep_las = 0.11
 dep_las_per_type = null
 sents_p = null
 sents_r = null
 sents_f = 0.0
-lemma_acc = 0.
+lemma_acc = 0.22
 
 [pretraining]
 
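The new `tokenizer = {"@tokenizers":"customize_tokenizer"}` entry only resolves if a factory by that name is registered before the config is loaded. The real factory ships in use_custom_tokenizer.py; a minimal sketch of the expected registration shape (the wrapper below is an assumed outline, not the repo's exact code):

```python
import spacy
from spacy.language import Language
from spacy.tokenizer import Tokenizer

from use_custom_tokenizer import custom_tokenizer  # module shipped in this repo


# The registered name must match the config's {"@tokenizers":"customize_tokenizer"}.
# spaCy calls the outer function once, then calls the returned factory with nlp.
@spacy.registry.tokenizers("customize_tokenizer")
def create_customize_tokenizer():
    def customize_tokenizer(nlp: Language) -> Tokenizer:
        return custom_tokenizer(nlp)

    return customize_tokenizer
```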
fr_trf_reflex_nrp-0.0.0-py3-none-any.whl
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:719a5cd2ea2461b701b8535bfbaa591227cb6a63ba60549c1c6f09efcd3c27bc
+size 824582048
meta.json
CHANGED
@@ -7,8 +7,8 @@
 "email":"",
 "url":"",
 "license":"",
-"spacy_version":">=3.8.
-"spacy_git_version":"
+"spacy_version":">=3.8.7,<3.9.0",
+"spacy_git_version":"4b65aa7",
 "vectors":{
 "width":0,
 "vectors":0,
@@ -586,7 +586,7 @@
 
 ],
 "requirements":[
-"spacy-transformers>=1.3.
-"spacy>=3.8.
+"spacy-transformers>=1.3.9,<1.4.0",
+"spacy>=3.8.7,<3.9.0"
 ]
 }
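The tightened pins can be checked at runtime; a small sketch assuming the third-party `packaging` module is available (it usually is wherever pip is):

```python
from importlib.metadata import version
from packaging.specifiers import SpecifierSet

# Pins copied from the updated "requirements" in meta.json.
pins = {
    "spacy": ">=3.8.7,<3.9.0",
    "spacy-transformers": ">=1.3.9,<1.4.0",
}
for package, spec in pins.items():
    installed = version(package)
    status = "ok" if installed in SpecifierSet(spec) else "out of range"
    print(f"{package} {installed}: {status}")
```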
ner/model
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:4ed9d24f9c390a1c7c7ad6bb84a402dafa34d3210d998e2a365e2830f9590edf
 size 219690
ner/moves
CHANGED
@@ -1 +1 @@
-��movesٴ{"0":{},"1":{"ORG":
+��movesٴ{"0":{},"1":{"ORG":6029,"LOC":4464,"PER":3965},"2":{"ORG":6029,"LOC":4464,"PER":3965},"3":{"ORG":6029,"LOC":4464,"PER":3965},"4":{"ORG":6029,"LOC":4464,"PER":3965,"":1},"5":{"":1}}�cfg��neg_key�
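The moves file is msgpack-serialized transition data for the NER component, which is why it renders as binary here. The numeric keys appear to be spaCy's BILUO-style parser actions (0 missing, 1 begin, 2 in, 3 last, 4 unit, 5 out), and the per-label counts (ORG 6029, LOC 4464, PER 3965) look like action frequencies from the training corpus. The label set is easy to confirm once the pipeline is installed:

```python
import spacy

nlp = spacy.load("fr_trf_reflex_nrp")
# The labels serialized in ner/moves should surface here: LOC, ORG, PER.
print(nlp.get_pipe("ner").labels)
```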
ner_transformer/model
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:755e7434a8f1df5e8dede50927501cb06e1264baa504ddb2dd909fbbf1740da6
+size 443538293
tokenizer
CHANGED
The diff for this file is too large to render. See raw diff.
use_custom_tokenizer.py
CHANGED
@@ -24,17 +24,26 @@ EXTENDED_LETTER_RANGE = "A-Za-zäöüÄÖÜàòèéìù"
 DATE = r"[0-3][1-9]\.[0-1][1-9]\.[1-2][0-9]{3}"
 TOP_LEVEL_DOMAINS = "ch|at|de|com|edu|org|gov|net|fr|uk|be|es|pl|it|eu|nl|ba|cz|dk|al|ad|bg|by|fi|gr|ie|li|lu|no|pt|ro|rs|ru|se|si|sk"
 
+DOT_BEFORE_WORD = [
+    rf"(?<!www)\.(?=[{EXTENDED_LETTER_RANGE}]{{{i}}})(?!(({TOP_LEVEL_DOMAINS})\b))"
+    for i in range(3, 30)
+]
+
+
 DOT_AFTER_WORD = [
-    rf"(?<!www\.)(?<=([
+    rf"(?<!www\.)(?<=([{EXTENDED_LETTER_RANGE}]){{{i}}})\.(?!({TOP_LEVEL_DOMAINS}))"
     for i in range(3, 30)
 ]
 
+
 DOT_AFTER_DATE = rf"(?<=({DATE}))\."
 
 infix_res = [
-    r"[\(\[\]\)]",
+    r"[\(\[\]\)\,\»\>\«\<]",  # brackets, commas and ' that are without space
     r"(?<=\.--)\.",  # DOT after .--
-
+    r"(?<=\,)\'",
+    *DOT_BEFORE_WORD,  # DOT before word
+    # rf"(?<!www)\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})",
     r"'\.\.",  # e.g., 'Tscheicha'.. -> "Tscheicha" "'..", then split ".." as suffix
     *DOT_AFTER_WORD,  # when there is no space after the dot
     r"[A-Z](?=\. )",  # DOT after capital letter
@@ -45,6 +54,11 @@ LETTER_DOUBLE_ENDING_DOT_VAR_LENGTH = [  # DOT after letter, e.g., A.G., or u.s.
     rf"(?<=([{EXTENDED_LETTER_RANGE}]\.){{{i}}})\." for i in range(1, 30)
 ]
 
+prefix_res = [
+    rf"\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})",  # DOT before word
+    *DOT_BEFORE_WORD,  # DOT before word
+]
+
 suffix_res = [
     r"(?<=\d)[\.]",  # DOT after number
     r"(?<=[\.])[\]\)]",  # Closing brackets with DOT before
@@ -63,10 +77,12 @@ NUMBER_SIGN_NUMBER_FIXED = r"(?<=[0-9])[+\*^](?=[0-9])"
 
 
 # Given a nlp object, return a custom tokenizer that splits on special cases and with unwanted tokenization removed
-def custom_tokenizer(nlp):
+def custom_tokenizer(nlp) -> Tokenizer:
     nlp.tokenizer = Tokenizer(nlp.vocab)
 
-    prefix_regex = compile_prefix_regex(nlp.Defaults.prefixes)
+    prefixes = nlp.Defaults.prefixes
+    prefixes += prefix_res
+    prefix_regex = compile_prefix_regex(prefixes)
     nlp.tokenizer.prefix_search = prefix_regex.search
 
     # We use the default infixes and remove some cases that lead to unwanted tokenization.
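Two details of the new patterns are easy to miss: the per-length loops exist because Python's `re` only allows fixed-width lookbehinds (which is why `DOT_AFTER_WORD` enumerates lengths 3-29 instead of using one `{3,30}` quantifier), and the TLD guard keeps domain names intact. A standalone sketch of the new `DOT_BEFORE_WORD` behavior, with constants copied from the file and made-up sample strings:

```python
import re

EXTENDED_LETTER_RANGE = "A-Za-zäöüÄÖÜàòèéìù"
TOP_LEVEL_DOMAINS = "ch|at|de|com|edu|org|gov|net|fr|uk|be|es|pl|it|eu|nl|ba|cz|dk|al|ad|bg|by|fi|gr|ie|li|lu|no|pt|ro|rs|ru|se|si|sk"

# One instance of DOT_BEFORE_WORD (i = 3): a dot glued to a following word
# of at least three letters, unless it follows "www" or starts a TLD.
dot_before_word = re.compile(
    rf"(?<!www)\.(?=[{EXTENDED_LETTER_RANGE}]{{3}})(?!(({TOP_LEVEL_DOMAINS})\b))"
)

print(bool(dot_before_word.search("fin.Les")))           # True: split point found
print(bool(dot_before_word.search("www.exemple")))       # False: "www" lookbehind
print(bool(dot_before_word.search("exemple.com/page")))  # False: TLD lookahead
```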
vocab/strings.json
CHANGED
The diff for this file is too large to render. See raw diff.