import re
from typing import List

import nltk  # only needed if the stricter word-filtering block inside tweet_cleaner is re-enabled
from transformers import pipeline
from tqdm import tqdm


def tweet_cleaner(tweet: str) -> str:
    """
    Cleans a tweet by removing URLs and collapsing extra whitespace.

    A stricter variant that also strips @ mentions and hashtag signs and keeps
    only dictionary words is left commented out below.

    Args:
        tweet (str): A single tweet as a string.

    Returns:
        str: The cleaned tweet.
    """
    # Stricter cleaning, currently disabled:
    # words = set(nltk.corpus.words.words())
    # # Remove @ mentions from the tweet
    # tweet = re.sub("@[A-Za-z0-9]+", "", tweet)
    # # Remove URLs from the tweet
    # tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)
    # # Remove extra whitespaces from the tweet
    # tweet = " ".join(tweet.split())
    # # Remove hashtag sign but keep the text
    # tweet = tweet.replace("#", "").replace("_", " ")
    # # Tokenize the tweet and keep only valid words
    # tweet = " ".join(
    #     w
    #     for w in nltk.wordpunct_tokenize(tweet)
    #     if w.lower() in words or not w.isalpha()
    # )
    # # Return the cleaned tweet
    # return tweet
    bad_start = ["http:", "https:"]
    for w in bad_start:
        tweet = re.sub(f" {w}\\S+", "", tweet)  # URL preceded by a space
        tweet = re.sub(f"{w}\\S+ ", "", tweet)  # tweet starts with a URL
        tweet = re.sub(f"\n{w}\\S+ ", "", tweet)  # URL at the start of a new line
        tweet = re.sub(f"\n{w}\\S+", "", tweet)  # URL alone on a new line
        tweet = re.sub(f"{w}\\S+", "", tweet)  # any remaining URL
    tweet = re.sub(" +", " ", tweet)  # collapse multiple spaces into one
    return " ".join(tweet.split()).strip()


def is_boring_tweet(tweet):
    """Return True if the tweet has fewer than three words that are not mentions, hashtags, or links."""
    boring_stuff = ["http", "@", "#"]
    not_boring_words = sum(
        1
        for word in tweet.split()
        if not any(bs in word.lower() for bs in boring_stuff)
    )
    return not_boring_words < 3
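
# Usage sketch (hypothetical tweets):
# is_boring_tweet("@user https://t.co/xyz #follow")        -> True  (no real words)
# is_boring_tweet("Training finished, pushing the model")  -> False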


def fix_text(text):
    """Unescape the HTML entities that show up in scraped tweet text."""
    text = text.replace("&amp;", "&")
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    return text
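
# Usage sketch (hypothetical input):
# fix_text("Tom &amp; Jerry &lt;3") -> "Tom & Jerry <3"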


def get_tweets_sentiment(tweets: List[str]) -> List[float]:
    """
    Takes a list of tweets (plain strings, or dicts with a "content" key) and
    returns their sentiment scores as a list of floats between 0 and 1.

    Parameters:
        tweets (List[str]): A list of tweet texts.

    Returns:
        List[float]: A sentiment score for each input tweet, each a float between 0 and 1.
    """
    # Load the sentiment analysis pipeline
    classifier = pipeline(
        "sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english"
    )
    # Clean tweets, pulling the text out of dicts if necessary
    if isinstance(tweets[0], dict):
        tweet_texts = [tweet_cleaner(t["content"]) for t in tqdm(tweets)]
    else:
        tweet_texts = [tweet_cleaner(t) for t in tqdm(tweets)]
    # Get tweet sentiment scores
    tweet_sentiments = classifier(tweet_texts)
    # Extract the sentiment score from each result and return as a list
    return [t["score"] for t in tweet_sentiments]