caiocof committed (verified)
Commit 86832db · 1 Parent(s): 452c794

Add BERTopic model

Files changed (4):
  1. README.md +71 -0
  2. config.json +17 -0
  3. topic_embeddings.safetensors +3 -0
  4. topics.json +319 -0
README.md ADDED
@@ -0,0 +1,71 @@
---
tags:
- bertopic
library_name: bertopic
pipeline_tag: text-classification
---

# jaria_topics

This is a [BERTopic](https://github.com/MaartenGr/BERTopic) model.
BERTopic is a flexible and modular topic modeling framework that allows for the generation of easily interpretable topics from large datasets.

## Usage

To use this model, please install BERTopic:

```
pip install -U bertopic
```

You can use the model as follows:

```python
from bertopic import BERTopic
topic_model = BERTopic.load("caiocof/jaria_topics")

topic_model.get_topic_info()
```
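
Once loaded, the model can also assign topics to new documents. A minimal sketch (the example document below is invented for illustration and is not part of the training data):

```python
# Assign a topic to unseen text with transform(); it returns one topic id
# per document, plus similarity-based probabilities.
new_docs = [
    "O auto de infração foi lavrado em desacordo com o princípio da legalidade.",
]
topics, probs = topic_model.transform(new_docs)
print(topics)  # e.g. [1] if the text lands in the "legalidade" topic
```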

## Topic overview

* Number of topics: 2
* Number of training documents: 206

<details>
<summary>Click here for an overview of all topics.</summary>

| Topic ID | Topic Keywords | Topic Frequency | Label |
|----------|----------------|-----------------|-------|
| 0 | de - do - da - no - que | 182 | 0_de_do_da_no |
| 1 | legalidade - do - ait - da - princípio | 24 | 1_legalidade_do_ait_da |

</details>
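
The keyword weights behind each row of this table are stored in topics.json (shown further down in this commit) and can also be read from the loaded model, for example:

```python
# List the top keywords of topic 1 together with their c-TF-IDF weights,
# matching the "legalidade / do / ait / da / princípio" row above.
for word, weight in topic_model.get_topic(1):
    print(f"{word}: {weight:.3f}")
```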

## Training hyperparameters

* calculate_probabilities: False
* language: portuguese
* low_memory: False
* min_topic_size: 10
* n_gram_range: (1, 1)
* nr_topics: None
* seed_topic_list: None
* top_n_words: 10
* verbose: False
* zeroshot_min_similarity: 0.7
* zeroshot_topic_list: None
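
For reference, these settings correspond roughly to the constructor call sketched below. This is an illustrative reconstruction, not the original training script (which is not part of this commit); the embedding model name is taken from config.json, shown below.

```python
from bertopic import BERTopic

# Illustrative re-creation of the configuration above; fitting would also
# require the 206 training documents, which are not included in this repo.
topic_model = BERTopic(
    embedding_model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    language="portuguese",
    top_n_words=10,
    n_gram_range=(1, 1),
    min_topic_size=10,
    nr_topics=None,
    low_memory=False,
    calculate_probabilities=False,
    seed_topic_list=None,
    zeroshot_topic_list=None,
    zeroshot_min_similarity=0.7,
    verbose=False,
)
```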

## Framework versions

* Numpy: 1.26.4
* HDBSCAN: 0.8.40
* UMAP: 0.5.7
* Pandas: 1.5.3
* Scikit-Learn: 1.3.0
* Sentence-transformers: 4.0.1
* Transformers: 4.48.1
* Numba: 0.59.1
* Plotly: 5.22.0
* Python: 3.12.4
config.json ADDED
@@ -0,0 +1,17 @@
{
  "calculate_probabilities": false,
  "language": "portuguese",
  "low_memory": false,
  "min_topic_size": 10,
  "n_gram_range": [1, 1],
  "nr_topics": null,
  "seed_topic_list": null,
  "top_n_words": 10,
  "verbose": false,
  "zeroshot_min_similarity": 0.7,
  "zeroshot_topic_list": null,
  "embedding_model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
}
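
The `embedding_model` entry names the multilingual sentence-transformers backbone used for document embeddings; `BERTopic.load` resolves it automatically. If you want to compute embeddings yourself (for example, to pass precomputed embeddings to `transform`), a small sketch with an invented example document:

```python
from sentence_transformers import SentenceTransformer

# Same backbone as named in config.json; encode() returns one 384-dimensional
# vector per document for this MiniLM model.
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
embeddings = embedder.encode(["exemplo de auto de infração de trânsito"])
print(embeddings.shape)  # (1, 384)
```
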
topic_embeddings.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2126562fe716a11fd8d4490a37598535d1bbc7ae24c61c566c15a23ff1741002
size 3160
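
This LFS pointer references the serialized topic embeddings in safetensors format (presumably one vector per topic). As a hedged sketch, the raw tensors can be inspected with the `safetensors` package; the exact tensor key used by BERTopic is not shown in this commit, so the snippet simply lists whatever is stored:

```python
from safetensors.numpy import load_file

# Inspect the tensors stored in the downloaded file; for two topics embedded
# with a 384-dimensional model, a shape like (2, 384) would be expected.
tensors = load_file("topic_embeddings.safetensors")
for name, array in tensors.items():
    print(name, array.shape, array.dtype)
```
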
topics.json ADDED
@@ -0,0 +1,319 @@
{
  "topic_representations": {
    "0": [
      ["de", 0.15117864812327939], ["do", 0.12469059164150768],
      ["da", 0.0721248453955329], ["no", 0.07170129807531282],
      ["que", 0.06912085045201405], ["infração", 0.06306215538126733],
      ["não", 0.05912077066993812], ["ait", 0.05799283987158575],
      ["art", 0.04879239668017904], ["auto", 0.0473727288250299]
    ],
    "1": [
      ["legalidade", 0.1736186949633571], ["do", 0.160291457783716],
      ["ait", 0.15639416694312788], ["da", 0.14308423677559173],
      ["princípio", 0.1426428220597304], ["que", 0.14093961629350088],
      ["lavratura", 0.1069821165447978], ["pilar", 0.1059668007728723],
      ["apresenta", 0.1033037537782839], ["se", 0.10016463170962911]
    ]
  },
  "topics": [
    0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
    0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
    0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
    0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0
  ],
  "topic_sizes": {"0": 182, "1": 24},
  "topic_mapper": [[0, 0, 1], [1, 1, 0]],
  "topic_labels": {
    "0": "0_de_do_da_no",
    "1": "1_legalidade_do_ait_da"
  },
  "custom_labels": null,
  "_outliers": 0,
  "topic_aspects": {}
}
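
The per-document assignments in `topics`, the `topic_sizes`, and the `topic_labels` above are internally consistent; a quick sketch to cross-check them from a local copy of topics.json:

```python
import json
from collections import Counter

# Recompute topic sizes from the per-document assignments and compare them
# with the stored topic_sizes (expected: 182 documents in topic 0, 24 in topic 1).
with open("topics.json", encoding="utf-8") as f:
    data = json.load(f)

counts = Counter(data["topics"])
print(counts)               # Counter({0: 182, 1: 24})
print(data["topic_sizes"])  # {'0': 182, '1': 24}
print(data["topic_labels"]) # {'0': '0_de_do_da_no', '1': '1_legalidade_do_ait_da'}
```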