caiocof committed (verified)
Commit 86832db · 1 Parent(s): 452c794

Add BERTopic model

Files changed (4):
  1. README.md +71 -0
  2. config.json +17 -0
  3. topic_embeddings.safetensors +3 -0
  4. topics.json +319 -0
README.md ADDED
@@ -0,0 +1,71 @@
---
tags:
- bertopic
library_name: bertopic
pipeline_tag: text-classification
---

# jaria_topics

This is a [BERTopic](https://github.com/MaartenGr/BERTopic) model.
BERTopic is a flexible and modular topic modeling framework that allows for the generation of easily interpretable topics from large datasets.

## Usage

To use this model, please install BERTopic:

```
pip install -U bertopic
```

You can use the model as follows:

```python
from bertopic import BERTopic
topic_model = BERTopic.load("caiocof/jaria_topics")

topic_model.get_topic_info()
```
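
Once loaded, the model can also assign topics to new documents. A minimal sketch (the example document below is invented for illustration and is not part of the training data):

```python
# Assign a topic to unseen text with transform(); it returns one topic id
# per document, plus similarity-based probabilities.
new_docs = [
    "O auto de infração foi lavrado em desacordo com o princípio da legalidade.",
]
topics, probs = topic_model.transform(new_docs)
print(topics)  # e.g. [1] if the text lands in the "legalidade" topic
```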

## Topic overview

* Number of topics: 2
* Number of training documents: 206

<details>
<summary>Click here for an overview of all topics.</summary>

| Topic ID | Topic Keywords | Topic Frequency | Label |
|----------|----------------|-----------------|-------|
| 0 | de - do - da - no - que | 182 | 0_de_do_da_no |
| 1 | legalidade - do - ait - da - princípio | 24 | 1_legalidade_do_ait_da |

</details>
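
The keyword weights behind each row of this table are stored in topics.json (shown further down in this commit) and can also be read from the loaded model, for example:

```python
# List the top keywords of topic 1 together with their c-TF-IDF weights,
# matching the "legalidade / do / ait / da / princípio" row above.
for word, weight in topic_model.get_topic(1):
    print(f"{word}: {weight:.3f}")
```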

## Training hyperparameters

* calculate_probabilities: False
* language: portuguese
* low_memory: False
* min_topic_size: 10
* n_gram_range: (1, 1)
* nr_topics: None
* seed_topic_list: None
* top_n_words: 10
* verbose: False
* zeroshot_min_similarity: 0.7
* zeroshot_topic_list: None
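
For reference, these settings correspond roughly to the constructor call sketched below. This is an illustrative reconstruction, not the original training script (which is not part of this commit); the embedding model name is taken from config.json, shown below.

```python
from bertopic import BERTopic

# Illustrative re-creation of the configuration above; fitting would also
# require the 206 training documents, which are not included in this repo.
topic_model = BERTopic(
    embedding_model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    language="portuguese",
    top_n_words=10,
    n_gram_range=(1, 1),
    min_topic_size=10,
    nr_topics=None,
    low_memory=False,
    calculate_probabilities=False,
    seed_topic_list=None,
    zeroshot_topic_list=None,
    zeroshot_min_similarity=0.7,
    verbose=False,
)
```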

## Framework versions

* Numpy: 1.26.4
* HDBSCAN: 0.8.40
* UMAP: 0.5.7
* Pandas: 1.5.3
* Scikit-Learn: 1.3.0
* Sentence-transformers: 4.0.1
* Transformers: 4.48.1
* Numba: 0.59.1
* Plotly: 5.22.0
* Python: 3.12.4
config.json ADDED
@@ -0,0 +1,17 @@
{
  "calculate_probabilities": false,
  "language": "portuguese",
  "low_memory": false,
  "min_topic_size": 10,
  "n_gram_range": [1, 1],
  "nr_topics": null,
  "seed_topic_list": null,
  "top_n_words": 10,
  "verbose": false,
  "zeroshot_min_similarity": 0.7,
  "zeroshot_topic_list": null,
  "embedding_model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
}
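
The `embedding_model` entry names the multilingual sentence-transformers backbone used for document embeddings; `BERTopic.load` resolves it automatically. If you want to compute embeddings yourself (for example, to pass precomputed embeddings to `transform`), a small sketch with an invented example document:

```python
from sentence_transformers import SentenceTransformer

# Same backbone as named in config.json; encode() returns one 384-dimensional
# vector per document for this MiniLM model.
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
embeddings = embedder.encode(["exemplo de auto de infração de trânsito"])
print(embeddings.shape)  # (1, 384)
```
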
topic_embeddings.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2126562fe716a11fd8d4490a37598535d1bbc7ae24c61c566c15a23ff1741002
size 3160
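
This LFS pointer references the serialized topic embeddings in safetensors format (presumably one vector per topic). As a hedged sketch, the raw tensors can be inspected with the `safetensors` package; the exact tensor key used by BERTopic is not shown in this commit, so the snippet simply lists whatever is stored:

```python
from safetensors.numpy import load_file

# Inspect the tensors stored in the downloaded file; for two topics embedded
# with a 384-dimensional model, a shape like (2, 384) would be expected.
tensors = load_file("topic_embeddings.safetensors")
for name, array in tensors.items():
    print(name, array.shape, array.dtype)
```
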
topics.json ADDED
@@ -0,0 +1,319 @@
{
  "topic_representations": {
    "0": [
      ["de", 0.15117864812327939], ["do", 0.12469059164150768],
      ["da", 0.0721248453955329], ["no", 0.07170129807531282],
      ["que", 0.06912085045201405], ["infração", 0.06306215538126733],
      ["não", 0.05912077066993812], ["ait", 0.05799283987158575],
      ["art", 0.04879239668017904], ["auto", 0.0473727288250299]
    ],
    "1": [
      ["legalidade", 0.1736186949633571], ["do", 0.160291457783716],
      ["ait", 0.15639416694312788], ["da", 0.14308423677559173],
      ["princípio", 0.1426428220597304], ["que", 0.14093961629350088],
      ["lavratura", 0.1069821165447978], ["pilar", 0.1059668007728723],
      ["apresenta", 0.1033037537782839], ["se", 0.10016463170962911]
    ]
  },
  "topics": [
    0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
    0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
    0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
    0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0
  ],
  "topic_sizes": {"0": 182, "1": 24},
  "topic_mapper": [[0, 0, 1], [1, 1, 0]],
  "topic_labels": {
    "0": "0_de_do_da_no",
    "1": "1_legalidade_do_ait_da"
  },
  "custom_labels": null,
  "_outliers": 0,
  "topic_aspects": {}
}
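
The per-document assignments in `topics`, the `topic_sizes`, and the `topic_labels` above are internally consistent; a quick sketch to cross-check them from a local copy of topics.json:

```python
import json
from collections import Counter

# Recompute topic sizes from the per-document assignments and compare them
# with the stored topic_sizes (expected: 182 documents in topic 0, 24 in topic 1).
with open("topics.json", encoding="utf-8") as f:
    data = json.load(f)

counts = Counter(data["topics"])
print(counts)               # Counter({0: 182, 1: 24})
print(data["topic_sizes"])  # {'0': 182, '1': 24}
print(data["topic_labels"]) # {'0': '0_de_do_da_no', '1': '1_legalidade_do_ait_da'}
```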