alessandro trinca tornidor
committed on
Commit
·
d6e9ab3
1
Parent(s):
ff39414
test: add more test cases for get_wordnet_synonyms()
Browse files
my_ghost_writer/text_parsers2.py
CHANGED
|
@@ -6,6 +6,7 @@ import nltk
|
|
| 6 |
import pyinflect
|
| 7 |
import spacy
|
| 8 |
from fastapi import HTTPException
|
|
|
|
| 9 |
|
| 10 |
from my_ghost_writer.constants import ELIGIBLE_POS, NLTK_DATA, SPACY_MODEL_NAME, app_logger
|
| 11 |
from my_ghost_writer.custom_synonym_handler import CustomSynonymHandler
|
|
@@ -341,8 +342,13 @@ def get_wordnet_synonyms(word: str, pos_tag: Optional[str] = None) -> list[dict[
|
|
| 341 |
# TermRelationships.PERTAINYM,
|
| 342 |
TermRelationships.SIMILAR_TO
|
| 343 |
]:
|
|
|
|
|
|
|
|
|
|
| 344 |
result = _get_related_words(synset, rel_type, word_lower)
|
| 345 |
if result:
|
|
|
|
|
|
|
| 346 |
related_word_groups_raw.append(result)
|
| 347 |
|
| 348 |
except Exception as ex1:
|
|
|
|
| 6 |
import pyinflect
|
| 7 |
import spacy
|
| 8 |
from fastapi import HTTPException
|
| 9 |
+
from nltk.corpus.reader import Synset
|
| 10 |
|
| 11 |
from my_ghost_writer.constants import ELIGIBLE_POS, NLTK_DATA, SPACY_MODEL_NAME, app_logger
|
| 12 |
from my_ghost_writer.custom_synonym_handler import CustomSynonymHandler
|
|
|
|
| 342 |
# TermRelationships.PERTAINYM,
|
| 343 |
TermRelationships.SIMILAR_TO
|
| 344 |
]:
|
| 345 |
+
app_logger.info(f"synset: {type(synset)}, '{synset}'")
|
| 346 |
+
if not isinstance(synset, Synset):
|
| 347 |
+
pass
|
| 348 |
result = _get_related_words(synset, rel_type, word_lower)
|
| 349 |
if result:
|
| 350 |
+
if result["relation_type"] == TermRelationships.CAUSE:
|
| 351 |
+
app_logger.info(f"Adding result for relation type '{rel_type}': {result}")
|
| 352 |
related_word_groups_raw.append(result)
|
| 353 |
|
| 354 |
except Exception as ex1:
|
tests/events/get_wordnet_synonyms_day_ok1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"$.definition": {"types": ["str"], "primary_type": "str", "is_array": false, "samples": ["time for Earth to make a complete rotation on its axis"], "sample_count": 1}, "$.examples": {"types": ["list"], "primary_type": "list", "is_array": false, "samples": [], "sample_count": 0}, "$.examples[*]": {"types": ["array"], "primary_type": "array", "is_array": true, "samples": [], "sample_count": 0, "array_length": 2}, "$.related_words": {"types": ["list"], "primary_type": "list", "is_array": false, "samples": [], "sample_count": 0}, "$.related_words[*]": {"types": ["array"], "primary_type": "array", "is_array": true, "samples": [], "sample_count": 0, "array_length": 5}, "$.related_words[*].base_form": {"types": ["str"], "primary_type": "str", "is_array": false, "samples": ["twenty-four hours", "twenty-four hour period", "24-hour interval"], "sample_count": 3}, "$.relation_type": {"types": ["TermRelationships"], "primary_type": "TermRelationships", "is_array": false, "samples": ["TermRelationships.SYNONYM"], "sample_count": 1}, "$.source": {"types": ["str"], "primary_type": "str", "is_array": false, "samples": ["wordnet"], "sample_count": 1}, "$.wordnet_pos": {"types": ["str"], "primary_type": "str", "is_array": false, "samples": ["n"], "sample_count": 1}}
|
tests/events/get_wordnet_synonyms_tense_ok1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"$.definition": {"types": ["str"], "primary_type": "str", "is_array": false, "samples": ["a grammatical category of verbs used to express distinctions of time"], "sample_count": 1}, "$.examples": {"types": ["list"], "primary_type": "list", "is_array": false, "samples": [], "sample_count": 0}, "$.examples[*]": {"types": ["array"], "primary_type": "array", "is_array": true, "samples": [], "sample_count": 0, "array_length": 0}, "$.related_words": {"types": ["list"], "primary_type": "list", "is_array": false, "samples": [], "sample_count": 0}, "$.related_words[*]": {"types": ["array"], "primary_type": "array", "is_array": true, "samples": [], "sample_count": 0, "array_length": 2}, "$.related_words[*].base_form": {"types": ["str"], "primary_type": "str", "is_array": false, "samples": ["grammatical category", "syntactic category"], "sample_count": 2}, "$.relation_type": {"types": ["TermRelationships"], "primary_type": "TermRelationships", "is_array": false, "samples": ["TermRelationships.HYPERNYM"], "sample_count": 1}, "$.source": {"types": ["str"], "primary_type": "str", "is_array": false, "samples": ["wordnet"], "sample_count": 1}, "$.wordnet_pos": {"types": ["str"], "primary_type": "str", "is_array": false, "samples": ["n"], "sample_count": 1}}
|
tests/my_ghost_writer/helpers_tests.py
CHANGED
|
@@ -1,3 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
|
| 3 |
def analyze_detailed_report_lists(cls, detailed_report: dict, expected_detailed_report: dict):
|
|
@@ -8,3 +15,68 @@ def analyze_detailed_report_lists(cls, detailed_report: dict, expected_detailed_
|
|
| 8 |
del row_v["sample_count"]
|
| 9 |
del expected_row_v["sample_count"]
|
| 10 |
cls.assertDictEqual(row_v, expected_row_v)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
from nltk.corpus import wordnet as wn
|
| 4 |
+
|
| 5 |
+
from tests import EVENTS_FOLDER
|
| 6 |
+
from my_ghost_writer.jsonpath_extractor import JSONPathStructureAnalyzer
|
| 7 |
+
from my_ghost_writer.text_parsers2 import get_wordnet_synonyms
|
| 8 |
|
| 9 |
|
| 10 |
def analyze_detailed_report_lists(cls, detailed_report: dict, expected_detailed_report: dict):
|
|
|
|
| 15 |
del row_v["sample_count"]
|
| 16 |
del expected_row_v["sample_count"]
|
| 17 |
cls.assertDictEqual(row_v, expected_row_v)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def assert__json_structure__get_wordnet_synonyms(cls, word):
    """Check the JSON structure of the first ``get_wordnet_synonyms(word)`` entry.

    The expected structure report is read from the fixture
    ``EVENTS_FOLDER / "get_wordnet_synonyms_{word}_ok1.json"`` and compared,
    through ``analyze_detailed_report_lists()``, with the report that
    ``JSONPathStructureAnalyzer`` builds for the first returned entry.

    Args:
        cls: the running ``unittest.TestCase`` instance; its assert methods
            are used to report failures.
        word: the word to look up; it also selects the fixture file name.
    """
    fixture_path = EVENTS_FOLDER / f"get_wordnet_synonyms_{word}_ok1.json"
    # Explicit encoding: the fixture must decode the same way on every platform.
    with open(fixture_path, "r", encoding="utf-8") as src:
        expected_detailed_report = json.load(src)

    related_words = get_wordnet_synonyms(word)
    # Fail with a readable message instead of an IndexError on an empty result.
    cls.assertTrue(related_words, f"get_wordnet_synonyms({word!r}) returned no results")
    first_related_words = related_words[0]

    analyzer = JSONPathStructureAnalyzer()
    analyzer.extract_all_paths(first_related_words)
    detailed_report = analyzer.get_detailed_type_report()
    analyze_detailed_report_lists(cls, detailed_report, expected_detailed_report)

    # To (re)generate the fixture for a new word, temporarily enable:
    # with open(fixture_path, "w", encoding="utf-8") as src:
    #     json.dump(detailed_report, src)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def get_relationships(synset):
    """Count the relations attached to ``synset``, grouped by relation family.

    Args:
        synset: an object exposing the synset accessors called below
            (``lemma_names()``, ``lemmas()``, ``hypernyms()``, ...).

    Returns:
        A dict mapping each relation family name to the number of entries
        found. Holonyms and meronyms add up their member, part and substance
        variants; antonyms are summed over every lemma of the synset.
    """
    counts = {}
    counts['synonyms'] = len(synset.lemma_names())

    antonym_total = 0
    for lemma in synset.lemmas():
        antonym_total += len(lemma.antonyms())
    counts['antonyms'] = antonym_total

    counts['hypernyms'] = len(synset.hypernyms())
    counts['hyponyms'] = len(synset.hyponyms())

    holonym_groups = (
        synset.member_holonyms(),
        synset.part_holonyms(),
        synset.substance_holonyms(),
    )
    counts['holonyms'] = sum(len(group) for group in holonym_groups)

    meronym_groups = (
        synset.member_meronyms(),
        synset.part_meronyms(),
        synset.substance_meronyms(),
    )
    counts['meronyms'] = sum(len(group) for group in meronym_groups)

    counts['similar_tos'] = len(synset.similar_tos())
    counts['also_sees'] = len(synset.also_sees())
    counts['causes'] = len(synset.causes())
    return counts
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def extract_word_relationships(top_n=5, pos_tags=('n', 'v')):
    """Print the WordNet synsets that carry the most relationships.

    Every synset of each part of speech in ``pos_tags`` is scored with
    ``get_relationships()``, then three rankings are printed to stdout:
    synsets with the most non-empty relation families, synsets with the most
    holonyms, and synsets with the most causes.

    Args:
        top_n: number of synsets printed per ranking (default 5).
        pos_tags: WordNet part-of-speech tags to scan (default nouns and verbs).

    Returns:
        None; the rankings are only printed.
    """
    # Local import so the block does not depend on a file-level import.
    import heapq

    results = []
    holonym_results = []
    cause_results = []

    for pos in pos_tags:
        for synset in wn.all_synsets(pos):
            rels = get_relationships(synset)
            # The score is the number of relation families with at least one
            # entry, not the total number of related words.
            total = sum(1 for v in rels.values() if v > 0)
            results.append((synset, total, rels))
            if rels['holonyms'] > 0:
                holonym_results.append((synset, rels['holonyms'], rels))
            if rels['causes'] > 0:
                cause_results.append((synset, rels['causes'], rels))

    def _print_top(title, rows):
        # Print the title, then the top_n rows ranked by their score (index 1).
        print(title)
        # heapq.nlargest is documented as equivalent to
        # sorted(rows, key=key, reverse=True)[:top_n], so the ranking matches
        # a full descending sort without ordering every synset.
        for synset, _score, rels in heapq.nlargest(top_n, rows, key=lambda row: row[1]):
            print(f"{synset.name()} ({synset.definition()}): {rels}")

    _print_top(f"Top {top_n} synsets with most relationships:", results)
    _print_top(f"\nTop {top_n} synsets with holonym relationships:", holonym_results)
    _print_top(f"\nTop {top_n} synsets with cause relationships:", cause_results)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
if __name__ == "__main__":
|
| 82 |
+
extract_word_relationships()
|
tests/my_ghost_writer/test_text_parsers2.py
CHANGED
|
@@ -10,7 +10,8 @@ from my_ghost_writer.text_parsers2 import (extract_contextual_info_by_indices, g
|
|
| 10 |
from my_ghost_writer.jsonpath_extractor import JSONPathStructureAnalyzer
|
| 11 |
from my_ghost_writer.type_hints import TermRelationships, RelatedEntry
|
| 12 |
from tests import EVENTS_FOLDER
|
| 13 |
-
from tests.my_ghost_writer.helpers_tests import analyze_detailed_report_lists
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
class TestTextParsers2(unittest.TestCase):
|
|
@@ -83,18 +84,13 @@ class TestTextParsers2(unittest.TestCase):
|
|
| 83 |
|
| 84 |
def test_get_wordnet_synonyms(self):
|
| 85 |
# Test with a word that has known synonyms
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
detailed_report = analyzer.get_detailed_type_report()
|
| 94 |
-
analyze_detailed_report_lists(self, detailed_report, expected_detailed_report)
|
| 95 |
-
|
| 96 |
-
# with open(EVENTS_FOLDER / "get_wordnet_synonyms_piano_ok1.json", "w") as src:
|
| 97 |
-
# json.dump(detailed_report, src)
|
| 98 |
|
| 99 |
def test_get_wordnet_synonyms_custom_entry(self):
|
| 100 |
word = "happy"
|
|
|
|
| 10 |
from my_ghost_writer.jsonpath_extractor import JSONPathStructureAnalyzer
|
| 11 |
from my_ghost_writer.type_hints import TermRelationships, RelatedEntry
|
| 12 |
from tests import EVENTS_FOLDER
|
| 13 |
+
from tests.my_ghost_writer.helpers_tests import (analyze_detailed_report_lists,
|
| 14 |
+
assert__json_structure__get_wordnet_synonyms)
|
| 15 |
|
| 16 |
|
| 17 |
class TestTextParsers2(unittest.TestCase):
|
|
|
|
| 84 |
|
| 85 |
def test_get_wordnet_synonyms(self):
|
| 86 |
# Test with a word that has known synonyms
|
| 87 |
+
assert__json_structure__get_wordnet_synonyms(self, "piano")
|
| 88 |
+
|
| 89 |
+
def test_get_wordnet_synonyms_day(self):
|
| 90 |
+
assert__json_structure__get_wordnet_synonyms(self, "day")
|
| 91 |
+
|
| 92 |
+
def test_get_wordnet_synonyms_tense(self):
|
| 93 |
+
assert__json_structure__get_wordnet_synonyms(self, "tense")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
def test_get_wordnet_synonyms_custom_entry(self):
|
| 96 |
word = "happy"
|