alessandro trinca tornidor committed on
Commit
7a541c8
·
1 Parent(s): fc3bbf3

refactor: avoid creating a PorterStemmer() instance every time text_stemming() runs

Browse files
my_ghost_writer/text_parsers.py CHANGED
@@ -1,9 +1,14 @@
1
  from typing import Iterator
2
 
 
 
3
  from my_ghost_writer.constants import app_logger
4
  from my_ghost_writer.type_hints import RequestTextRowsParentList, ResponseTextRowsDict
5
 
6
 
 
 
 
7
  def text_stemming(text: str | RequestTextRowsParentList) -> ResponseTextRowsDict:
8
  """
9
  Applies Porter Stemmer algorithm to reduce words in a given text to their base form;
@@ -17,10 +22,8 @@ def text_stemming(text: str | RequestTextRowsParentList) -> ResponseTextRowsDict
17
  tuple[int, dict]: a tuple with the number of processed total rows within the initial text and the words frequency dict
18
  """
19
  import json
20
- from nltk import PorterStemmer
21
  from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer
22
-
23
- ps = PorterStemmer()
24
  try:
25
  valid_textrows_with_num = json.loads(text)
26
  app_logger.info("valid_textrows_with_num::json")
@@ -51,13 +54,13 @@ def text_stemming(text: str | RequestTextRowsParentList) -> ResponseTextRowsDict
51
  idx_rows_parent.append(None)
52
  row_words_tokens.append(wordpunct_tokenize(row))
53
  row_offsets_tokens.append(WordPunctTokenizer().span_tokenize(row))
54
- words_stems_dict = get_words_tokens_and_indexes(row_words_tokens, row_offsets_tokens, ps, idx_rows, idx_rows_child, idx_rows_parent)
55
  n_total_rows = len(valid_textrows_with_num)
56
  return n_total_rows, words_stems_dict
57
 
58
 
59
  def get_words_tokens_and_indexes(
60
- words_tokens_list: list[str], offsets_tokens_list: list | Iterator, ps, idx_rows_list: list[int], idx_rows_child: list[int], idx_rows_parent: list[int]
61
  ) -> dict:
62
  """
63
  Get the word tokens and their indexes in the text.
 
1
  from typing import Iterator
2
 
3
+ from nltk import PorterStemmer
4
+
5
  from my_ghost_writer.constants import app_logger
6
  from my_ghost_writer.type_hints import RequestTextRowsParentList, ResponseTextRowsDict
7
 
8
 
9
+ ps = PorterStemmer()
10
+
11
+
12
  def text_stemming(text: str | RequestTextRowsParentList) -> ResponseTextRowsDict:
13
  """
14
  Applies Porter Stemmer algorithm to reduce words in a given text to their base form;
 
22
  tuple[int, dict]: a tuple with the number of processed total rows within the initial text and the words frequency dict
23
  """
24
  import json
 
25
  from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer
26
+
 
27
  try:
28
  valid_textrows_with_num = json.loads(text)
29
  app_logger.info("valid_textrows_with_num::json")
 
54
  idx_rows_parent.append(None)
55
  row_words_tokens.append(wordpunct_tokenize(row))
56
  row_offsets_tokens.append(WordPunctTokenizer().span_tokenize(row))
57
+ words_stems_dict = get_words_tokens_and_indexes(row_words_tokens, row_offsets_tokens, idx_rows, idx_rows_child, idx_rows_parent)
58
  n_total_rows = len(valid_textrows_with_num)
59
  return n_total_rows, words_stems_dict
60
 
61
 
62
  def get_words_tokens_and_indexes(
63
+ words_tokens_list: list[str], offsets_tokens_list: list | Iterator, idx_rows_list: list[int], idx_rows_child: list[int], idx_rows_parent: list[int]
64
  ) -> dict:
65
  """
66
  Get the word tokens and their indexes in the text.
tests/test_text_parsers.py CHANGED
@@ -44,7 +44,6 @@ class TestTextParsers(unittest.TestCase):
44
  words_stems_dict = get_words_tokens_and_indexes(
45
  row_words_tokens,
46
  row_offsets_tokens,
47
- self.ps,
48
  idx_rows,
49
  idx_rows_child,
50
  idx_rows_parent
 
44
  words_stems_dict = get_words_tokens_and_indexes(
45
  row_words_tokens,
46
  row_offsets_tokens,
 
47
  idx_rows,
48
  idx_rows_child,
49
  idx_rows_parent