igmochang committed on
Commit
0bfe83e
·
verified ·
1 Parent(s): 4cd873d

Update README.md

Browse files

Added the text preprocessing function that was used.

Files changed (1) hide show
  1. README.md +29 -0
README.md CHANGED
@@ -571,6 +571,35 @@ print(similarities.shape)
571
  # [3, 3]
572
  ```
573
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
574
  <!--
575
  ### Direct Usage (Transformers)
576
 
 
571
  # [3, 3]
572
  ```
573
 
574
+ Preprocessing function:
575
+ ```python
576
+ import re
577
+ import nltk
578
+ from nltk.corpus import stopwords
579
+ from nltk.stem import SnowballStemmer
580
+ from nltk.tokenize import word_tokenize
581
+
582
+ # Download the required NLTK resources, then initialize the Spanish stopwords and stemmer
583
+ nltk.download('punkt')
584
+ nltk.download('stopwords')
585
+ spanish_stopwords = set(stopwords.words('spanish'))
586
+ stemmer = SnowballStemmer('spanish')
587
+
588
+ # Preprocess text: lowercase, strip punctuation, remove Spanish stopwords, and apply stemming
589
+ def preprocess_text(text):
590
+ # Convert to lowercase
591
+ text = text.lower()
592
+ # Remove punctuation and special characters (keeping ¿, ? and %)
593
+ text = re.sub(r'[^\w\s¿?%]', '', text)
594
+ # Tokenize
595
+ words = word_tokenize(text)
596
+ # Remove stopwords and apply stemming
597
+ words = [stemmer.stem(word) for word in words if word not in spanish_stopwords]
598
+ # Rejoin the words
599
+ return ' '.join(words)
600
+ ```
601
+
602
+
603
  <!--
604
  ### Direct Usage (Transformers)
605