from typing import Iterator


def clean_string(s: str) -> str:
    """
    Clean a given string by
    1. removing punctuation with nltk.classify.TextCat()'s remove_punctuation() method
    2. stripping newline and carriage-return characters.

    Args:
        s (str): The string to clean.

    Returns:
        str: The cleaned string.
    """
    from nltk.classify import TextCat

    # remove_punctuation() strips punctuation characters only; the translate()
    # call below removes the residual newline and carriage-return characters
    tc = TextCat()
    cleaned_word = tc.remove_punctuation(text=s)
    return cleaned_word.translate(str.maketrans("", "", "\n\r"))

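# A minimal usage sketch for clean_string(), assuming the NLTK data that
# TextCat needs (the "crubadan" corpus) is available locally; the sample
# string below is illustrative only:
#
#   clean_string("Hello, world!\n")  # -> "Hello world"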

def text_stemming(text: str) -> tuple[int, dict]:
    """
    Applies the Porter stemmer algorithm to reduce the words of a given text to
    their base form. The text is split into rows on newline characters; each row
    is tokenized with WordPunctTokenizer, and the resulting tokens are grouped
    by stem into a frequency dict that maps every recognized base form to the
    list of matching words with their positions.

    Args:
        text (str): Input string containing the text to be stemmed.

    Returns:
        tuple[int, dict]: a tuple with the total number of processed rows within
            the initial text and the word-frequency dict keyed by stem.
    """
    from nltk import PorterStemmer
    from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer

    ps = PorterStemmer()
    # instantiate the tokenizer once instead of once per row
    tokenizer = WordPunctTokenizer()
    text_split_newline = text.split("\n")
    row_words_tokens = []
    row_offsets_tokens = []
    for row in text_split_newline:
        row_words_tokens.append(wordpunct_tokenize(row))
        row_offsets_tokens.append(tokenizer.span_tokenize(row))
    words_stems_dict = get_words_tokens_and_indexes(row_words_tokens, row_offsets_tokens, ps)
    n_total_rows = len(text_split_newline)
    return n_total_rows, words_stems_dict

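# A hypothetical usage sketch for text_stemming(); the exact character offsets
# depend on the tokenizer output, so they are elided here. Since
# PorterStemmer.stem() lowercases its input by default, e.g. "Running" and
# "runs" both end up under the "run" key:
#
#   n_rows, stems = text_stemming("The runner runs.\nRunning again!")
#   # n_rows == 2
#   # stems["run"]["offsets_array"] contains entries like
#   #   {"word": "runs", "offsets": [...], "n_row": 0}
#   #   {"word": "Running", "offsets": [...], "n_row": 1}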

def get_words_tokens_and_indexes(
        words_tokens_list: list[list[str]], offsets_tokens_list: list | Iterator, ps, min_len_words: int = 3
    ) -> dict:
    """
    Get the words tokens and their indexes in the text.

    Args:
        words_tokens_list (list[list[str]]): List of token lists, one per row.
        offsets_tokens_list (list | Iterator): List of (start, end) token spans, one iterable per row.
        ps (PorterStemmer): The stemmer to use.
        min_len_words (int): Minimum length of cleaned words to include.

    Returns:
        dict: Dictionary with stemmed words as keys and, for each stem, its
              occurrence count and a list of dictionaries containing the
              original word and its offsets.
    """
    words_stems_dict = {}
    for n_row, (words_tokens, offsets_tokens) in enumerate(zip(words_tokens_list, offsets_tokens_list)):
        for word, offsets in zip(words_tokens, offsets_tokens):
            # skip tokens that are too short once punctuation is stripped
            cleaned_word = clean_string(word)
            if len(cleaned_word) < min_len_words:
                continue
            # PorterStemmer.stem() lowercases by default, so different
            # capitalizations of the same word share one stem entry
            stem = ps.stem(word)
            if stem not in words_stems_dict:
                words_stems_dict[stem] = {"count": 0, "word_prefix": stem, "offsets_array": []}
            count, word_offsets = update_stems_list(words_stems_dict[stem], word, offsets, n_row=n_row)
            words_stems_dict[stem] = {"count": count, "word_prefix": stem, "offsets_array": word_offsets}
    return words_stems_dict


def update_stems_list(current_stem_entry: dict, word: str, offsets: tuple, n_row: int) -> tuple:
    """
    Update the given stem entry with a new occurrence of a word.

    Args:
        current_stem_entry (dict): Dict with the current stem count and the list of word occurrences.
        word (str): The word occurrence to record.
        offsets (tuple): (start, end) character offsets of the word within its row.
        n_row (int): The row number in the original text.

    Returns:
        tuple[int, list]: the updated occurrence count and the updated offsets list.
    """
    n, word_offsets = current_stem_entry["count"], current_stem_entry["offsets_array"]
    n += 1
    word_offsets.append({"word": word, "offsets": list(offsets), "n_row": n_row})
    return n, word_offsets
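

if __name__ == "__main__":
    # Minimal demo sketch, not part of the module's public API: it assumes the
    # NLTK resources that TextCat needs (the "crubadan" corpus and the
    # third-party "regex" package) are available locally; the sample text is
    # illustrative only
    sample_text = "Stemming groups words.\nStemmed words share a key."
    total_rows, stems_dict = text_stemming(sample_text)
    print(f"processed rows: {total_rows}")
    for stem_key, stem_entry in stems_dict.items():
        print(stem_key, stem_entry["count"], [occ["word"] for occ in stem_entry["offsets_array"]])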