import json
import unittest
from unittest.mock import patch, MagicMock

from fastapi import HTTPException

from my_ghost_writer.custom_synonym_handler import CustomSynonymHandler
from my_ghost_writer.jsonpath_extractor import JSONPathStructureAnalyzer
from my_ghost_writer.text_parsers2 import (extract_contextual_info_by_indices, get_wordnet_synonyms, inflect_synonym,
                                           is_nlp_available, process_synonym_groups)
from my_ghost_writer.type_hints import TermRelationships, RelatedEntry
from tests import EVENTS_FOLDER
from tests.my_ghost_writer.helpers_tests import analyze_detailed_report_lists


class TestTextParsers2(unittest.TestCase):
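    """Tests for my_ghost_writer.text_parsers2: spaCy availability checks, contextual info
    extraction, WordNet synonym lookups, inflection, and synonym-group processing."""
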
    @patch("my_ghost_writer.text_parsers2.nlp", new=MagicMock())
    def test_is_nlp_available_ok(self):
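        """Tests that is_nlp_available() returns True when the spaCy model object is present."""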
        check = is_nlp_available()
        self.assertTrue(check)

    @patch("my_ghost_writer.text_parsers2.nlp", new=None)
    def test_is_nlp_available_fail(self):
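        """Tests that is_nlp_available() returns False when the spaCy model is None."""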
        check = is_nlp_available()
        self.assertFalse(check)

    def test_extract_contextual_info_by_indices_valid(self):
        """Tests valid context extraction using the real spaCy model."""
        text = "The quick brown fox jumps over the lazy dog"
        start_idx, end_idx, target_word = 4, 9, "quick"

        result = extract_contextual_info_by_indices(text, start_idx, end_idx, target_word)

        self.assertEqual(result['word'], target_word)
        self.assertEqual(result['lemma'], "quick")
        self.assertEqual(result['pos'], "ADJ")
        self.assertEqual(result['tag'], "JJ")
        self.assertEqual(result['dependency'], "amod")
        self.assertEqual(result['context_sentence'], text)
        self.assertIn("context_words", result)
        self.assertEqual(result['char_start'], start_idx)
        self.assertEqual(result['char_end'], start_idx + len(target_word))

    def test_extract_contextual_info_by_indices_invalid_indices(self):
        """Tests that invalid indices raise a 400 HTTPException."""
        with self.assertRaises(HTTPException) as context:
            extract_contextual_info_by_indices("Test text that raises a 400 ", 100, 200, "test")

        self.assertEqual(context.exception.status_code, 400)
        self.assertEqual(context.exception.detail, "Invalid start/end indices")

    @patch("my_ghost_writer.text_parsers2.nlp", new=None)
    def test_extract_contextual_info_nlp_unavailable(self):
        """Tests that a 500 HTTPException is raised if the spaCy model is not available."""
        with self.assertRaises(HTTPException) as context:
            extract_contextual_info_by_indices("text", 0, 4, "text")

        self.assertEqual(context.exception.status_code, 500)
        self.assertEqual(context.exception.detail, "spaCy model not available")

    def test_extract_contextual_info_word_mismatch(self):
        """Tests that a word/index mismatch raises a 400 error that is re-wrapped as a 500 HTTPException."""
        text = "The quick brown fox"
        start_idx, end_idx, target_word = 4, 9, "brown"

        with self.assertRaises(HTTPException) as context:
            extract_contextual_info_by_indices(text, start_idx, end_idx, target_word)

        self.assertEqual(context.exception.status_code, 500)
        self.assertIn(f"Error analyzing context: 400: Could not find token for word '{target_word}' at indices {start_idx}-{end_idx}", context.exception.detail)

    @patch("my_ghost_writer.text_parsers2.nlp")
    def test_extract_contextual_info_word_none(self, nlp_mock):
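        """Tests that an empty spaCy doc (no matching token) is reported as a 500 HTTPException."""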
        # The mocked nlp callable returns an empty doc, so no token can match the target word
        nlp_mock.return_value = []
        text = "The quick brown fox jumps over the lazy dog"
        start_idx, end_idx, target_word = 4, 9, "quick"
        with self.assertRaises(HTTPException) as context:
            extract_contextual_info_by_indices(text, start_idx, end_idx, target_word)

        self.assertEqual(context.exception.status_code, 500)
        self.assertIn("Error analyzing context: 400: Could not find token for word 'quick' at indices 4-9", context.exception.detail)

    def test_get_wordnet_synonyms(self):
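        """Tests the structure of the first related-words group for 'piano' against a stored JSON report."""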
        with open(EVENTS_FOLDER / "get_wordnet_synonyms_piano_ok1.json", "r") as src:
            expected_detailed_report = json.load(src)
        word = "piano"
        related_words = get_wordnet_synonyms(word)
        first_related_words = related_words[0]
        analyzer = JSONPathStructureAnalyzer()
        analyzer.extract_all_paths(first_related_words)
        detailed_report = analyzer.get_detailed_type_report()
        analyze_detailed_report_lists(self, detailed_report, expected_detailed_report)

    def test_get_wordnet_synonyms_custom_entry(self):
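        """Tests ADJ-filtered lookup for 'happy' against a stored report, ignoring the variable 'examples' field."""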
        word = "happy"
        pos = "ADJ"
        with open(EVENTS_FOLDER / "get_wordnet_synonyms_custom_entry_happy.json", "r") as src:
            expected_report_dict_list = json.load(src)

        related_word_groups = get_wordnet_synonyms(word, pos)
        self.assertIsInstance(related_word_groups[0]["examples"], list)
        for related_word_nth, expected_detailed_report_nth in zip(related_word_groups, expected_report_dict_list):
            del related_word_nth['examples']
            analyzer = JSONPathStructureAnalyzer()
            analyzer.extract_all_paths(related_word_nth)
            detailed_report = analyzer.get_detailed_type_report(get_samples=False)
            self.assertDictEqual(detailed_report, expected_detailed_report_nth)

    def test_get_wordnet_synonyms_pos_filter(self):
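        """Tests that filtering by pos_tag='VERB' returns only verb senses."""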
        word = "hunt"
        related_words_verbs = get_wordnet_synonyms(word, pos_tag="VERB")

        self.assertGreater(len(related_words_verbs), 0)
        for sense in related_words_verbs:
            self.assertEqual(sense['wordnet_pos'], 'v')

    @patch("my_ghost_writer.text_parsers2.wn.synsets")
    def test_get_wordnet_synonyms_generic_exception(self, mock_synsets):
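        """Tests that an unexpected WordNet error is wrapped in a 500 HTTPException."""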
        mock_synsets.side_effect = Exception("test exception")
        with self.assertRaises(HTTPException) as context:
            get_wordnet_synonyms("test", 'NOUN')

        self.assertEqual(context.exception.status_code, 500)
        self.assertIn("Error retrieving related words: 'test exception'", context.exception.detail)

    def test_inflect_synonym_noun_plural(self):
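        """Tests noun plural inflection (NNS), including the irregular 'child' -> 'children'."""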
        original_token_info = {'pos': 'NOUN', 'tag': 'NNS', 'is_lower': True, 'is_title': False, 'is_upper': False}

        self.assertEqual(inflect_synonym("kid", original_token_info), "kids")
        self.assertEqual(inflect_synonym("child", original_token_info), "children")

    def test_inflect_synonym_verb_past1(self):
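        """Tests verb past tense inflection (VBD) with a full contextual-info payload."""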
        original_token_info = {
            'word': 'looked', 'lemma': 'look', 'pos': 'VERB', 'tag': 'VBD', 'is_title': False, 'is_upper': False,
            'is_lower': True, 'dependency': 'ROOT',
            'context_sentence': 'He looked back at the whisperers as if he wanted to say something to them, but thought better of it.',
            'context_words': ['He', 'looked', 'back', 'at', 'the', 'whisperers', 'as'], 'sentence_position': 1,
            'char_start': 3, 'char_end': 9, 'original_indices': {'start': 3, 'end': 9}
        }
        result = inflect_synonym("write", original_token_info)
        self.assertEqual(result, "wrote")

    def test_inflect_synonym_verb_past2(self):
        """Tests verb past tense inflection (VBD)."""
        original_token_info = {'pos': 'VERB', 'tag': 'VBD', 'is_lower': True, 'is_title': False, 'is_upper': False}
        self.assertEqual(inflect_synonym("write", original_token_info), "wrote")
        self.assertEqual(inflect_synonym("look", original_token_info), "looked")

    def test_inflect_synonym_verb_present_participle(self):
        """Tests verb present participle inflection (VBG, e.g., 'writing')."""
        original_token_info = {'pos': 'VERB', 'tag': 'VBG', 'is_lower': True, 'is_title': False, 'is_upper': False}

        self.assertEqual(inflect_synonym("write", original_token_info), "writing")
        self.assertEqual(inflect_synonym("look", original_token_info), "looking")

    def test_inflect_synonym_verb_third_person_singular(self):
        """Tests verb third-person singular inflection (VBZ, e.g., 'writes')."""
        original_token_info = {'pos': 'VERB', 'tag': 'VBZ', 'is_lower': True, 'is_title': False, 'is_upper': False}

        self.assertEqual(inflect_synonym("write", original_token_info), "writes")
        self.assertEqual(inflect_synonym("look", original_token_info), "looks")

    def test_inflect_synonym_adjective_comparative(self):
        """Tests adjective comparative inflection (e.g., large -> larger) without mocks."""
        original_token_info = {
            'word': 'bigger',
            'lemma': 'big',
            'pos': 'ADJ',
            'tag': 'JJR',
            'is_title': False,
            'is_upper': False,
            'is_lower': True,
            'dependency': 'acomp',
            'context_sentence': 'My house is bigger than yours.',
            'context_words': ['house', 'is', 'bigger', 'than', 'yours', '.'],
            'sentence_position': 3,
            'char_start': 12,
            'char_end': 18,
            'original_indices': {'start': 12, 'end': 18}
        }
        synonym_to_inflect = "large"

        result = inflect_synonym(synonym_to_inflect, original_token_info)

        self.assertEqual(result, "larger")

    def test_inflect_synonym_adjective_superlative(self):
        """Tests adjective superlative inflection (e.g., large -> largest) without mocks."""
        original_token_info = {
            'word': 'greatest',
            'lemma': 'great',
            'pos': 'ADJ',
            'tag': 'JJS',
            'is_title': False,
            'is_upper': False,
            'is_lower': True,
            'dependency': 'amod',
            'context_sentence': 'He is the greatest of all time.',
            'context_words': ['is', 'the', 'greatest', 'of', 'all', 'time', '.'],
            'sentence_position': 3,
            'char_start': 10,
            'char_end': 18,
            'original_indices': {'start': 10, 'end': 18}
        }
        synonym_to_inflect = "large"

        result = inflect_synonym(synonym_to_inflect, original_token_info)

        self.assertEqual(result, "largest")

    def test_inflect_synonym_is_title(self):
        """Tests verb past tense inflection (VBD) with is_title=True; is_lower is set to False for good measure."""
        original_token_info = {'pos': 'VERB', 'tag': 'VBD', 'is_lower': False, 'is_title': True, 'is_upper': False}
        self.assertEqual(inflect_synonym("write", original_token_info), "Wrote")
        self.assertEqual(inflect_synonym("look", original_token_info), "Looked")

    def test_inflect_synonym_is_upper(self):
        """Tests verb past tense inflection (VBD) with is_upper=True; is_lower is set to False for good measure."""
        original_token_info = {'pos': 'VERB', 'tag': 'VBD', 'is_lower': False, 'is_title': False, 'is_upper': True}
        self.assertEqual(inflect_synonym("write", original_token_info), "WROTE")
        self.assertEqual(inflect_synonym("look", original_token_info), "LOOKED")

    @patch("my_ghost_writer.text_parsers2.nlp", new=None)
    def test_inflect_synonym_nlp_none(self):
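        """Tests that the synonym is returned unchanged when the spaCy model is unavailable."""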
        result = inflect_synonym("test", {})
        self.assertEqual(result, "test")

    @patch("my_ghost_writer.text_parsers2.nlp")
    def test_inflect_synonym_nlp_exception(self, nlp_mock):
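        """Tests that the synonym is returned unchanged when the spaCy pipeline raises an exception."""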
        nlp_mock.side_effect = Exception("test exception")
        original_token_info = {'pos': 'VERB', 'tag': 'VBG', 'is_lower': True, 'is_title': False, 'is_upper': False}
        self.assertEqual(inflect_synonym("test", original_token_info), "test")

        self.assertEqual(inflect_synonym("write", original_token_info), "write")
        self.assertEqual(inflect_synonym("look", original_token_info), "look")

    def test_inflect_synonym_verbs(self):
        """Tests various verb inflections using subtests."""
        test_cases = [
            ("VBD", "write", "wrote"),
            ("VBD", "look", "looked"),
            ("VBG", "write", "writing"),
            ("VBG", "look", "looking"),
            ("VBZ", "write", "writes"),
            ("VBZ", "look", "looks"),
        ]

        for tag, synonym, expected in test_cases:
            with self.subTest(tag=tag, synonym=synonym):
                original_token_info = {
                    'pos': 'VERB', 'tag': tag, 'is_lower': True, 'is_title': False, 'is_upper': False
                }
                self.assertEqual(inflect_synonym(synonym, original_token_info), expected)

    def test_inflect_synonym_casing(self):
        """Tests that casing is correctly applied during inflection."""
        test_cases = [
            (True, False, "write", "Wrote"),
            (False, True, "write", "WROTE"),
            (False, False, "look", "looked"),
        ]
        for is_title, is_upper, synonym, expected in test_cases:
            with self.subTest(is_title=is_title, is_upper=is_upper):
                original_token_info = {
                    'pos': 'VERB', 'tag': 'VBD', 'is_lower': not (is_title or is_upper),
                    'is_title': is_title, 'is_upper': is_upper
                }
                self.assertEqual(inflect_synonym(synonym, original_token_info), expected)

    def test_process_synonym_groups(self):
        """Tests the full processing pipeline for a verb."""
        word = "look"
        context_info = {'char_end': 9, 'char_start': 3,
                        'context_sentence': 'He looked back at the whisperers as if he wanted to say something to them, but thought better of it.',
                        'context_words': ['He', 'looked', 'back', 'at', 'the', 'whisperers', 'as'],
                        'dependency': 'ROOT', 'is_lower': True, 'is_title': False, 'is_upper': False, 'lemma': 'look',
                        'original_indices': {'end': 9, 'start': 3}, 'pos': 'VERB', 'sentence_position': 1, 'tag': 'VBD',
                        'word': 'looked'}

        result = process_synonym_groups(word, context_info)

        self.assertIsInstance(result, list)
        self.assertGreater(len(result), 0)

        first_sense = dict(result[0])
        self.assertIn('definition', first_sense)
        self.assertIn('related_words', first_sense)

        first_synonym_info = dict(first_sense['related_words'][0])
        self.assertIn('base_form', first_synonym_info)
        self.assertIn('inflected_form', first_synonym_info)

        self.assertNotEqual(first_synonym_info['base_form'], first_synonym_info['inflected_form'])

    def test_process_synonym_groups_custom_entry(self):
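        """Tests the shape of every related-words group returned for an adjective in context."""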
        word = "happy"
        context_info = {
            'char_end': 60, 'char_start': 55,
            'context_sentence': 'Even Muggles like yourself should be celebrating, this happy, happy day!"',
            'context_words': ['should', 'be', 'celebrating', ',', 'this', 'happy', ',', 'happy', 'day', '!', '"'],
            'dependency': 'amod', 'is_lower': True, 'is_title': False, 'is_upper': False, 'lemma': 'happy',
            'original_indices': {'end': 60, 'start': 55}, 'pos': 'ADJ', 'sentence_position': 9,
            'tag': 'JJ', 'word': 'happy'
        }

        result_related_word_groups_list = process_synonym_groups(word, context_info)
        self.assertIsInstance(result_related_word_groups_list, list)
        for related_words_group in result_related_word_groups_list:
            related_word_group_dict = dict(related_words_group)
            self.assertIsInstance(related_word_group_dict, dict)
            self.assertIsInstance(related_word_group_dict["definition"], str)
            self.assertIn("relation_type", related_word_group_dict)
            self.assertIn(related_word_group_dict["relation_type"], TermRelationships)
            self.assertIsInstance(related_word_group_dict["examples"], list)
            related_words = related_word_group_dict["related_words"]
            for _word_dict in related_words:
                word_dict = dict(_word_dict)
                self.assertIsInstance(word_dict, dict)
                self.assertIsInstance(word_dict["base_form"], str)
                self.assertIsInstance(word_dict["inflected_form"], str)
                self.assertIsInstance(word_dict["matches_context"], bool)
                self.assertIn("is_custom", word_dict)

    @patch("my_ghost_writer.text_parsers2.wn.synsets")
    def test_process_synonym_groups_not_synonyms_by_sense(self, mock_synsets):
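        """Tests that an empty synset list yields an empty result."""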
        mock_synsets.return_value = []
        context_info = {'pos': 'VERB', 'lemma': 'look'}
        result = process_synonym_groups("look", context_info)
        self.assertListEqual(result, [])

    @patch("my_ghost_writer.text_parsers2.custom_synonym_handler", new_callable=CustomSynonymHandler)
    def test_process_synonym_groups_includes_custom_entries(self, mock_handler):
        """Tests that custom synonyms are correctly processed and included in the results."""
        # new_callable gives a fresh CustomSynonymHandler instance, so the entry added
        # here is isolated from custom synonyms registered elsewhere
        custom_entry = RelatedEntry(
            type=TermRelationships.SYNONYM,
            words=["gleeful", "elated"],
            definition="A custom definition for happy"
        )
        mock_handler.add_entry("happy", [custom_entry])

        word = "happy"
        context_info = {
            'lemma': 'happy', 'pos': 'ADJ', 'tag': 'JJ', 'is_lower': True,
            'is_title': False, 'is_upper': False
        }

        result_groups = process_synonym_groups(word, context_info)

        custom_group = next((g for g in result_groups if g.related_words and g.related_words[0].is_custom), None)

        self.assertIsNotNone(custom_group, "A custom synonym group should have been found in the results.")
        self.assertEqual(custom_group.definition, "A custom definition for happy")
        self.assertEqual(custom_group.relation_type, TermRelationships.SYNONYM)

        custom_base_forms = {related_word.base_form for related_word in custom_group.related_words}
        self.assertEqual(custom_base_forms, {"gleeful", "elated"})


if __name__ == '__main__':
    unittest.main()