Update README.md
Browse files
Added the preprocess function used.
README.md
CHANGED
@@ -571,6 +571,35 @@ print(similarities.shape)
|
|
571 |
# [3, 3]
|
572 |
```
|
573 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
574 |
<!--
|
575 |
### Direct Usage (Transformers)
|
576 |
|
|
|
571 |
# [3, 3]
|
572 |
```
|
573 |
|
574 |
+
Preprocess function:
|
575 |
+
```python
|
576 |
+
import re
|
577 |
+
import nltk
|
578 |
+
from nltk.corpus import stopwords
|
579 |
+
from nltk.stem import SnowballStemmer
|
580 |
+
from nltk.tokenize import word_tokenize
|
581 |
+
|
582 |
+
# Download the required NLTK data, then initialize the Spanish stopwords and stemmer
|
583 |
+
nltk.download('punkt')
|
584 |
+
nltk.download('stopwords')
|
585 |
+
spanish_stopwords = set(stopwords.words('spanish'))
|
586 |
+
stemmer = SnowballStemmer('spanish')
|
587 |
+
|
588 |
+
# Preprocess text: lowercase, strip punctuation, remove stopwords, and apply stemming
|
589 |
+
def preprocess_text(text):
|
590 |
+
# Convert to lowercase
|
591 |
+
text = text.lower()
|
592 |
+
# Strip every character except word characters, whitespace, and the symbols ¿ ? %
|
593 |
+
text = re.sub(r'[^\w\s¿?%]', '', text)
|
594 |
+
# Tokenize (no language argument is passed to word_tokenize)
|
595 |
+
words = word_tokenize(text)
|
596 |
+
# Drop tokens found in spanish_stopwords, then stem the remaining tokens
|
597 |
+
words = [stemmer.stem(word) for word in words if word not in spanish_stopwords]
|
598 |
+
# Join the stemmed tokens back into a single space-separated string
|
599 |
+
return ' '.join(words)
|
600 |
+
```
|
601 |
+
|
602 |
+
|
603 |
<!--
|
604 |
### Direct Usage (Transformers)
|
605 |
|