alessandro trinca tornidor committed
Commit · 7a541c8
1 Parent(s): fc3bbf3

refactor: avoid creating a PorterStemmer() instance every time text_stemming() runs
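The change hoists construction of a reusable, effectively stateless helper out of the hot function and into import time, so each text_stemming() call pays only for stemming, not for building the stemmer. A minimal standalone sketch of the two shapes (illustrative only, not code from this repo; the timing harness and function names are assumptions):

import timeit

from nltk import PorterStemmer

# After the refactor: one module-level instance, built once at import time.
ps = PorterStemmer()


def stem_per_call(word: str) -> str:
    # Before the refactor: a fresh PorterStemmer on every invocation.
    return PorterStemmer().stem(word)


def stem_shared(word: str) -> str:
    # After the refactor: reuse the shared module-level instance.
    return ps.stem(word)


if __name__ == "__main__":
    # Rough comparison only; absolute numbers vary by machine and NLTK version.
    print("per-call:", timeit.timeit(lambda: stem_per_call("running"), number=10_000))
    print("shared:  ", timeit.timeit(lambda: stem_shared("running"), number=10_000))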
my_ghost_writer/text_parsers.py
CHANGED
@@ -1,9 +1,14 @@
 from typing import Iterator
 
+from nltk import PorterStemmer
+
 from my_ghost_writer.constants import app_logger
 from my_ghost_writer.type_hints import RequestTextRowsParentList, ResponseTextRowsDict
 
 
+ps = PorterStemmer()
+
+
 def text_stemming(text: str | RequestTextRowsParentList) -> ResponseTextRowsDict:
     """
     Applies Porter Stemmer algorithm to reduce words in a given text to their base form;
@@ -17,10 +22,8 @@ def text_stemming(text: str | RequestTextRowsParentList) -> ResponseTextRowsDict:
        tuple[int, dict]: a tuple with the number of processed total rows within the initial text and the words frequency dict
    """
    import json
-    from nltk import PorterStemmer
    from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer
-
-    ps = PorterStemmer()
+
    try:
        valid_textrows_with_num = json.loads(text)
        app_logger.info("valid_textrows_with_num::json")
@@ -51,13 +54,13 @@ def text_stemming(text: str | RequestTextRowsParentList) -> ResponseTextRowsDict:
            idx_rows_parent.append(None)
        row_words_tokens.append(wordpunct_tokenize(row))
        row_offsets_tokens.append(WordPunctTokenizer().span_tokenize(row))
-    words_stems_dict = get_words_tokens_and_indexes(row_words_tokens, row_offsets_tokens, ps, idx_rows, idx_rows_child, idx_rows_parent)
+    words_stems_dict = get_words_tokens_and_indexes(row_words_tokens, row_offsets_tokens, idx_rows, idx_rows_child, idx_rows_parent)
    n_total_rows = len(valid_textrows_with_num)
    return n_total_rows, words_stems_dict


 def get_words_tokens_and_indexes(
-        words_tokens_list: list[str], offsets_tokens_list: list | Iterator, ps, idx_rows_list: list[int], idx_rows_child: list[int], idx_rows_parent: list[int]
+        words_tokens_list: list[str], offsets_tokens_list: list | Iterator, idx_rows_list: list[int], idx_rows_child: list[int], idx_rows_parent: list[int]
 ) -> dict:
    """
    Get the word tokens and their indexes in the text.
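One trade-off of the new module-level ps is that it is constructed at import time even if text_stemming() is never called. Where lazy construction mattered, a cached factory would keep the same single-instance guarantee; a hedged alternative sketch, not what this commit does:

from functools import lru_cache

from nltk import PorterStemmer


@lru_cache(maxsize=1)
def get_stemmer() -> PorterStemmer:
    # Built on first use, then cached for the life of the process.
    return PorterStemmer()

The eager module-level instance the commit chooses is the simpler option; its construction cost is paid once, at import.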
tests/test_text_parsers.py
CHANGED
@@ -44,7 +44,6 @@ class TestTextParsers(unittest.TestCase):
         words_stems_dict = get_words_tokens_and_indexes(
             row_words_tokens,
             row_offsets_tokens,
-            self.ps,
             idx_rows,
             idx_rows_child,
             idx_rows_parent
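With the stemmer no longer injected, the test simply drops its self.ps argument; any test that wants to substitute the stemmer would now patch the module-level name instead. An illustrative sketch, assuming get_words_tokens_and_indexes reads the shared ps internally after this change:

from unittest import mock

from my_ghost_writer import text_parsers


def test_uses_stubbed_stemmer():
    # Swap the shared module-level stemmer for the duration of this test.
    with mock.patch.object(text_parsers, "ps") as fake_ps:
        fake_ps.stem.side_effect = str.lower
        # ...call text_parsers.text_stemming(...) here and assert on the result...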