Spaces:
Sleeping
Sleeping
Created a function for generating keywords for bodies of text.
Browse files- config.py +1 -0
- example.env +1 -0
- rag_app/utils/generate_keywords_bert.py +82 -0
config.py
CHANGED
@@ -9,6 +9,7 @@ SQLITE_FILE_NAME = os.getenv('SOURCES_CACHE')
|
|
9 |
PERSIST_DIRECTORY = os.getenv('VECTOR_DATABASE_LOCATION')
|
10 |
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
|
11 |
SEVEN_B_LLM_MODEL = os.getenv("SEVEN_B_LLM_MODEL")
|
|
|
12 |
|
13 |
|
14 |
db = DataBaseHandler()
|
|
|
9 |
PERSIST_DIRECTORY = os.getenv('VECTOR_DATABASE_LOCATION')
|
10 |
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
|
11 |
SEVEN_B_LLM_MODEL = os.getenv("SEVEN_B_LLM_MODEL")
|
12 |
+
BERT_MODEL = os.getenv("BERT_MODEL")
|
13 |
|
14 |
|
15 |
db = DataBaseHandler()
|
example.env
CHANGED
@@ -27,3 +27,4 @@ LLM_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1"
|
|
27 |
LLM_MODEL_ARGS=
|
28 |
|
29 |
SEVEN_B_LLM_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
|
|
|
|
27 |
LLM_MODEL_ARGS=
|
28 |
|
29 |
SEVEN_B_LLM_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
|
30 |
+
BERT_MODEL="paraphrase-multilingual-MiniLM-L12-v2"
|
rag_app/utils/generate_keywords_bert.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from functools import lru_cache
from typing import Any, Dict, List, Tuple

from keybert import KeyBERT
3 |
+
|
4 |
+
def extract_keywords_from_doc(
    doc: str,
    model_name: str = "paraphrase-multilingual-MiniLM-L12-v2",
    **kwargs: Any,
) -> List[Tuple[str, float]]:
    """
    Extract keywords from a document using the KeyBERT model.

    ## Parameters:
        doc (str): The document from which to extract keywords.
        model_name (str): The name of the sentence-transformer model to use.
            Default is "paraphrase-multilingual-MiniLM-L12-v2".
        **kwargs: Additional keyword arguments forwarded verbatim to
            ``KeyBERT.extract_keywords``. Possible keyword arguments include:
            - top_n (int): The number of top keywords to return.
            - keyphrase_ngram_range (Tuple[int, int]): The ngram range for the keyphrases.
            - stop_words (str): The stop words to use.
            - use_maxsum (bool): Whether to use Max Sum Similarity.
            - use_mmr (bool): Whether to use Maximal Marginal Relevance.
            - diversity (float): The diversity parameter for MMR.
            - nr_candidates (int): The number of candidates for Max Sum Similarity.

    ## Returns:
        List[Tuple[str, float]]: A list of (keyword, score) tuples. An empty
        list is returned for empty or whitespace-only input.

    ## Example:
        keywords = extract_keywords_from_doc(
            doc,
            top_n=10,
            keyphrase_ngram_range=(1, 2),
            stop_words='english',
            use_maxsum=True,
            nr_candidates=20
        )
        print(keywords)
    """
    # Empty/whitespace input yields no keywords; skip the (expensive) model load.
    if not doc or not doc.strip():
        return []
    # Model instances are cached per name — loading a sentence-transformer on
    # every call would dominate the runtime of this function.
    kw_model = _get_kw_model(model_name)
    keywords = kw_model.extract_keywords(doc, **kwargs)
    return keywords


@lru_cache(maxsize=4)
def _get_kw_model(model_name: str) -> "KeyBERT":
    """Load and cache a KeyBERT model for *model_name* (loading is expensive)."""
    return KeyBERT(model=model_name)
|
56 |
+
|
57 |
+
if __name__ == "__main__":

    # Example usage: extract keyphrases from a short passage on supervised learning.
    sample_text = """
    Supervised learning is the machine learning task of learning a function that
    maps an input to an output based on example input-output pairs. It infers a
    function from labeled training data consisting of a set of training examples.
    In supervised learning, each example is a pair consisting of an input object
    (typically a vector) and a desired output value (also called the supervisory signal).
    A supervised learning algorithm analyzes the training data and produces an inferred function,
    which can be used for mapping new examples. An optimal scenario will allow for the
    algorithm to correctly determine the class labels for unseen instances. This requires
    the learning algorithm to generalize from the training data to unseen situations in a
    'reasonable' way (see inductive bias).
    """

    # Extraction options are forwarded verbatim to KeyBERT.extract_keywords.
    extraction_options = {
        "top_n": 10,
        "keyphrase_ngram_range": (1, 2),
        "stop_words": 'english',
        "use_maxsum": True,
        "nr_candidates": 20,
    }
    extracted = extract_keywords_from_doc(sample_text, **extraction_options)
    print(extracted)
|