# mgw/tests/test_text_parsers.py
# Author: alessandro trinca tornidor
# Commit 000cfc3: "feat: add backend support for multi-word stemming with test cases"
import json
import unittest
import nltk
from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer
from tests import EVENTS_FOLDER
def get_inputs_for(valid_textrows_with_num):
    """Build the arguments passed to ``get_words_tokens_and_indexes_ngrams`` in these tests.

    Args:
        valid_textrows_with_num: iterable of dicts. Every dict must have a
            ``"text"`` and an ``"idxRow"`` key; ``"idxRowChild"`` and
            ``"idxRowParent"`` are optional.

    Returns:
        A 6-tuple ``(row_words_tokens, row_offsets_tokens, idx_rows,
        idx_rows_child, idx_rows_parent, rows_dict)``. The first five items
        are lists holding exactly one entry per input row, in input order;
        ``rows_dict`` maps each ``idxRow`` to its text.

    Raises:
        KeyError: if a row lacks ``"text"`` or ``"idxRow"``.
    """
    row_words_tokens = []
    row_offsets_tokens = []
    idx_rows = []
    idx_rows_child = []
    idx_rows_parent = []
    rows_dict = {}
    for textrow in valid_textrows_with_num:
        row = textrow["text"]
        idx_row = textrow["idxRow"]
        rows_dict[idx_row] = row
        idx_rows.append(idx_row)
        # Resolve both optional indexes BEFORE appending anything: appending
        # inside the ``try`` would leave a stray child entry behind whenever
        # only "idxRowParent" is missing, putting the parallel lists out of step.
        try:
            idx_row_child = textrow["idxRowChild"]
            idx_row_parent = textrow["idxRowParent"]
        except KeyError:
            # A row without the complete child/parent pair gets None for both.
            idx_row_child = idx_row_parent = None
        idx_rows_child.append(idx_row_child)
        idx_rows_parent.append(idx_row_parent)
        row_words_tokens.append(wordpunct_tokenize(row))
        # The span_tokenize() result is stored exactly as returned (not copied into a list).
        row_offsets_tokens.append(WordPunctTokenizer().span_tokenize(row))
    return row_words_tokens, row_offsets_tokens, idx_rows, idx_rows_child, idx_rows_parent, rows_dict
class TestTextParsers(unittest.TestCase):
    """Tests for ``my_ghost_writer.text_parsers`` checked against fixtures in ``EVENTS_FOLDER``.

    The ``response_*.json`` fixtures store the expected stems dictionary under
    the ``"words_stems_dict"`` key (the ``text_stemming`` ones also carry
    ``"n_total_rows"``). To refresh one, ``json.dump`` the freshly computed
    values with ``indent=2`` to the same path.
    """

    # Show the complete diff when a comparison between large dicts fails.
    maxDiff = None

    @classmethod
    def setUpClass(cls):
        """Download the NLTK resources once per class rather than before every test."""
        super().setUpClass()
        nltk.download("punkt")
        nltk.download('punkt_tab')
        nltk.download("wordnet")
        nltk.download("omw-1.4")
        nltk.download('averaged_perceptron_tagger_eng')

    @staticmethod
    def _read_json(filename):
        """Return the parsed content of the JSON fixture ``filename`` in ``EVENTS_FOLDER``."""
        # Explicit encoding: do not depend on the platform's locale default.
        with open(EVENTS_FOLDER / filename, "r", encoding="utf-8") as src:
            return json.load(src)

    @staticmethod
    def _read_text(filename):
        """Return the whole content of the text fixture ``filename`` in ``EVENTS_FOLDER``."""
        with open(EVENTS_FOLDER / filename, "r", encoding="utf-8") as src:
            return src.read()

    def _assert_matches_fixture(self, words_stems_dict, filename):
        """Assert ``words_stems_dict`` equals the ``"words_stems_dict"`` entry of fixture ``filename``."""
        expected_words_stems_dict = self._read_json(filename)["words_stems_dict"]
        self.assertDictEqual(words_stems_dict, expected_words_stems_dict)

    def setUp(self):
        """Load the request fixtures and pre-tokenize the reference story row by row."""
        self.get_words_tokens_and_indexes_inputs = self._read_json("get_words_tokens_and_indexes_inputs.json")
        self.text_json_list_no_parents = self._read_json("request_text_stemming_no_parents.json")
        self.text_json_list_with_parents = self._read_json("request_text_stemming_with_parents.json")
        self.original_text = self._read_text("llm_generated_story_3.txt")
        self.text_split_newline = self.original_text.split("\n")
        self.row_words_tokens = [wordpunct_tokenize(row) for row in self.text_split_newline]
        self.row_offsets_tokens = [
            WordPunctTokenizer().span_tokenize(row) for row in self.text_split_newline
        ]
        self.ps = nltk.PorterStemmer()
        self.wnl = nltk.WordNetLemmatizer()

    def test_text_stemming_text(self):
        """``text_stemming`` on a plain text string: one row per newline-separated line."""
        from my_ghost_writer.text_parsers import text_stemming

        n_total_rows, words_stems_dict = text_stemming(self.original_text)
        self.assertEqual(n_total_rows, len(self.text_split_newline))
        self._assert_matches_fixture(
            words_stems_dict, "response_text_stemming_from_llm_generated_story_3.json"
        )

    def test_text_stemming_input_str_json(self):
        """``text_stemming`` on the no-parents row list serialized as a JSON string."""
        from my_ghost_writer.text_parsers import text_stemming

        json_str = json.dumps(self.text_json_list_no_parents)
        n_total_rows, words_stems_dict = text_stemming(json_str)
        self.assertEqual(n_total_rows, len(self.text_json_list_no_parents))
        self._assert_matches_fixture(words_stems_dict, "response_text_stemming_empty_rows.json")

    def test_text_stemming_list_no_parents(self):
        """``text_stemming`` on the list of row dicts loaded from the no-parents request fixture."""
        from my_ghost_writer.text_parsers import text_stemming

        n_total_rows, words_stems_dict = text_stemming(self.text_json_list_no_parents)
        self.assertEqual(n_total_rows, len(self.text_json_list_no_parents))
        self._assert_matches_fixture(words_stems_dict, "response_text_stemming_no_parents.json")

    def test_text_stemming_list_with_parents(self):
        """``text_stemming`` with ``n=3`` on the row dicts from the with-parents request fixture."""
        from my_ghost_writer.text_parsers import text_stemming

        n_total_rows, words_stems_dict = text_stemming(self.text_json_list_with_parents, n=3)
        self.assertEqual(n_total_rows, len(self.text_json_list_with_parents))
        self._assert_matches_fixture(words_stems_dict, "response_text_stemming_with_parents.json")

    def test_text_stemming_wrong_input(self):
        """A bare dict is rejected with a ``TypeError`` carrying the expected message."""
        from my_ghost_writer.text_parsers import text_stemming

        with self.assertRaises(TypeError) as raised:
            text_stemming({"text": "This is a test."})
        self.assertEqual(
            str(raised.exception),
            "Invalid input type. Expected plain text str, json str or list of dictionaries, not '<class 'dict'>'.",
        )

    def test_update_stems_list(self):
        """``update_stems_list`` returns the expected count and offsets for every recorded case."""
        from my_ghost_writer.text_parsers import update_stems_list

        test_args = self._read_json("update_stem_list_inputs.json")
        test_args_inputs = test_args["input"]
        test_args_outputs = test_args["output"]
        for case_idx, (arg, expected) in enumerate(zip(test_args_inputs, test_args_outputs)):
            # subTest reports which case failed and lets the remaining cases still run.
            with self.subTest(case=case_idx):
                count, word_offsets = update_stems_list(
                    arg["current_stem_tuple"],
                    arg["word"],
                    arg["offsets"],
                    n_row=arg["n_row"],
                    n_row_child=arg["n_row_child"],
                    n_row_parent=arg["n_row_parent"],
                )
                self.assertEqual(count, expected["count"])
                self.assertEqual(word_offsets, expected["offsets_array"])

    def test_get_words_tokens_and_indexes_ngrams_no_parents(self):
        """``get_words_tokens_and_indexes_ngrams`` with ``n=5`` on rows that carry no child/parent index."""
        from my_ghost_writer.text_parsers import get_words_tokens_and_indexes_ngrams

        text = self._read_text("llm_generated_story_4.txt")
        valid_textrows_with_num = [{"idxRow": i, "text": row} for i, row in enumerate(text.split("\n"))]
        row_words_tokens, row_offsets_tokens, idx_rows, idx_rows_child, idx_rows_parent, rows_dict = get_inputs_for(
            valid_textrows_with_num
        )
        words_stems_dict = get_words_tokens_and_indexes_ngrams(
            row_words_tokens,
            row_offsets_tokens,
            idx_rows,
            idx_rows_child,
            idx_rows_parent,
            rows_dict=rows_dict,
            n=5,
        )
        self._assert_matches_fixture(
            words_stems_dict, "response_get_words_tokens_and_indexes_ngrams_text4_n5.json"
        )

    def test_get_words_tokens_and_indexes_ngrams_with_parents(self):
        """``get_words_tokens_and_indexes_ngrams`` with ``n=3`` on the with-parents request rows."""
        from my_ghost_writer.text_parsers import get_words_tokens_and_indexes_ngrams

        row_words_tokens, row_offsets_tokens, idx_rows, idx_rows_child, idx_rows_parent, rows_dict = get_inputs_for(
            self.text_json_list_with_parents
        )
        words_stems_dict = get_words_tokens_and_indexes_ngrams(
            row_words_tokens,
            row_offsets_tokens,
            idx_rows,
            idx_rows_child,
            idx_rows_parent,
            rows_dict=rows_dict,
            n=3,
        )
        # Same expected output as test_text_stemming_list_with_parents.
        self._assert_matches_fixture(words_stems_dict, "response_text_stemming_with_parents.json")
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()