import re
from typing import List

import nltk  # only needed if the stricter word-filtering block inside tweet_cleaner is re-enabled
from transformers import pipeline
from tqdm import tqdm


def tweet_cleaner(tweet: str) -> str:
    """
    Cleans a tweet by removing URLs and collapsing extra whitespace.

    A stricter variant that also strips @ mentions and hashtag signs and keeps
    only dictionary words is left commented out below.

    Args:
        tweet (str): A single tweet as a string.

    Returns:
        str: The cleaned tweet.
    """
    # Stricter cleaning, currently disabled:
    # words = set(nltk.corpus.words.words())
    # # Remove @ mentions from the tweet
    # tweet = re.sub("@[A-Za-z0-9]+", "", tweet)
    # # Remove URLs from the tweet
    # tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)
    # # Remove extra whitespaces from the tweet
    # tweet = " ".join(tweet.split())
    # # Remove hashtag sign but keep the text
    # tweet = tweet.replace("#", "").replace("_", " ")
    # # Tokenize the tweet and keep only valid words
    # tweet = " ".join(
    #     w
    #     for w in nltk.wordpunct_tokenize(tweet)
    #     if w.lower() in words or not w.isalpha()
    # )
    # # Return the cleaned tweet
    # return tweet
    bad_start = ["http:", "https:"]
    for w in bad_start:
        tweet = re.sub(f" {w}\\S+", "", tweet)  # URL preceded by a space
        tweet = re.sub(f"{w}\\S+ ", "", tweet)  # tweet starts with a URL
        tweet = re.sub(f"\n{w}\\S+ ", "", tweet)  # URL at the start of a new line
        tweet = re.sub(f"\n{w}\\S+", "", tweet)  # URL alone on a new line
        tweet = re.sub(f"{w}\\S+", "", tweet)  # any remaining URL
    tweet = re.sub(" +", " ", tweet)  # collapse multiple spaces into one
    return " ".join(tweet.split()).strip()


def is_boring_tweet(tweet):
    """Return True if the tweet has fewer than three words that are not mentions, hashtags, or links."""
    boring_stuff = ["http", "@", "#"]
    not_boring_words = sum(
        1
        for word in tweet.split()
        if not any(bs in word.lower() for bs in boring_stuff)
    )
    return not_boring_words < 3
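
# Usage sketch (hypothetical tweets):
# is_boring_tweet("@user https://t.co/xyz #follow")        -> True  (no real words)
# is_boring_tweet("Training finished, pushing the model")  -> False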


def fix_text(text):
    """Unescape the HTML entities that show up in scraped tweet text."""
    text = text.replace("&amp;", "&")
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    return text
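
# Usage sketch (hypothetical input):
# fix_text("Tom &amp; Jerry &lt;3") -> "Tom & Jerry <3"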


def get_tweets_sentiment(tweets: List[str]) -> List[float]:
    """
    Takes a list of tweets (plain strings, or dicts with a "content" key) and
    returns their sentiment scores as a list of floats between 0 and 1.

    Parameters:
        tweets (List[str]): A list of tweet texts.

    Returns:
        List[float]: A sentiment score for each input tweet, each a float between 0 and 1.
    """
    # Load the sentiment analysis pipeline
    classifier = pipeline(
        "sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english"
    )
    # Clean tweets, pulling the text out of dicts if necessary
    if isinstance(tweets[0], dict):
        tweet_texts = [tweet_cleaner(t["content"]) for t in tqdm(tweets)]
    else:
        tweet_texts = [tweet_cleaner(t) for t in tqdm(tweets)]
    # Get tweet sentiment scores
    tweet_sentiments = classifier(tweet_texts)
    # Extract the sentiment score from each result and return as a list
    return [t["score"] for t in tweet_sentiments]