## 导入库

In [1]:
import re
import nltk
from nltk.corpus.reader.tagged import ToktokTokenizer
from bs4 import BeautifulSoup
import pandas as pd
import joblib

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

## 读取数据

In [2]:
# Read the emotion dataset from the Kaggle input directory and preview the first rows.
DATA_PATH = '/kaggle/input/emotion-analysis-based-on-text/emotion_sentimen_dataset.csv'
df = pd.read_csv(DATA_PATH, encoding='utf-8')
df.head()

Unnamed: 0.1,Unnamed: 0,text,Emotion
0,0,i seriously hate one subject to death but now ...,hate
1,1,im so full of life i feel appalled,neutral
2,2,i sit here to write i start to dig out my feel...,neutral
3,3,ive been really angry with r and i feel like a...,anger
4,4,i feel suspicious if there is no one outside l...,neutral


## 数据预处理

### 去掉 'Unnamed: 0' 列

In [3]:
# Drop the 'Unnamed: 0' column (presumably a row index that was saved into the
# CSV). The membership guard keeps this cell safe to re-run once the column is gone.
if 'Unnamed: 0' in df.columns:
    del df['Unnamed: 0']
# Look the cell up with a single .loc[row, column] call instead of the chained
# df.loc[0]['text'], which first builds the whole row as an intermediate Series.
df.loc[0, 'text']

'i seriously hate one subject to death but now i feel reluctant to drop it'

### 检查缺失值

In [4]:
# Per-column flag: True when the column holds at least one missing value.
missing_flags = df.isnull().any()
missing_flags

text       False
Emotion    False
dtype: bool

没有缺失值

### 统计标签个数

In [5]:
# Number of rows per emotion label, most frequent label first.
emotion_counts = df['Emotion'].value_counts()
emotion_counts

Emotion
neutral       674538
love           39553
happiness      27175
sadness        17481
relief         16729
hate           15267
anger          12336
fun            10075
enthusiasm      9304
surprise        6954
empty           5542
worry           4475
boredom          126
Name: count, dtype: int64

neutral 情绪占绝大多数（约 80%），类别严重不平衡，评估时应关注各类别的召回率，而不仅仅是总体准确率。

### 过滤HTML标签和方括号内容

In [6]:
def noiseremovel_text(text):
    """Strip HTML markup and square-bracketed spans from ``text``.

    The input is parsed with BeautifulSoup's ``html.parser`` and reduced to its
    text content; afterwards every ``[...]`` span (brackets included) is deleted.
    Whitespace that surrounded a removed span is left in place.

    Args:
        text: Raw string, possibly containing HTML tags.

    Returns:
        The cleaned string.
    """
    plain = BeautifulSoup(text, "html.parser").get_text()
    # "[" followed by any run of non-"]" characters, up to the closing "]".
    return re.sub(r'\[[^]]*\]', '', plain)

In [7]:
# Demo input: text wrapped in a <div> with two square-bracketed spans (a link and an ad).
sample_text = '<div>I really enjoyed the latest episode of my favorite show! [https://t.co/xyz123] Check out this link for a recap. #bestshowever [Ad: Stream now on MyStreamingService for 50% off!]</div>'
# The cleaned result should contain neither the tags nor the bracketed spans.
trans_sample_text = noiseremovel_text(sample_text)
trans_sample_text

'I really enjoyed the latest episode of my favorite show!  Check out this link for a recap. #bestshowever '

In [8]:
# Clean every row of the text column, then write the cleaned values back.
cleaned_text = df['text'].apply(noiseremovel_text)
df['text'] = cleaned_text

### 移除stopwords

In [9]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# English stop-word list as returned by NLTK ...
stopwords = nltk.corpus.stopwords.words('english')
# ... and the same words as a set, used for membership tests in remove_stopwords.
stop_wr = set(stopwords)

def remove_stopwords(text, stop_words):
    """Remove stop words from ``text`` and return the remaining tokens joined by spaces.

    Tokens are runs of word characters or runs of two or more dots; every other
    character is dropped, so a contraction such as "don't" is split into "don"
    and "t". Each token is lower-cased before the lookup, so ``stop_words``
    is expected to contain lower-case entries.

    Args:
        text: Input string.
        stop_words: Iterable of stop words. A ``set`` or ``frozenset`` is used
            as is; any other iterable is converted to a set once per call.

    Returns:
        The surviving tokens, in their original order and casing, joined by
        single spaces (an empty string when nothing survives).
    """
    # Do not copy an already-built set again for every row of a large apply().
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)
    tokens = re.findall(r'\w+|\.\.+', text)
    return ' '.join(tok for tok in tokens if tok.lower() not in stop_words)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Demo on the first dataset sentence, filtered with the English stop-word set.
sample_text = 'i seriously hate one subject to death but now i feel reluctant to drop it'
trans_sample_text = remove_stopwords(sample_text, stop_wr)
trans_sample_text

'seriously hate one subject death feel reluctant drop'

In [11]:
# Strip stop words from every row of the text column, reusing the prebuilt set.
df['text'] = df['text'].apply(lambda row_text: remove_stopwords(row_text, stop_wr))

## 对情绪标签编码

In [12]:
# One-hot encode the emotion labels (one output column per distinct label).
label_binarizer = LabelBinarizer()

emotion_encoded = label_binarizer.fit_transform(df['Emotion'])
# NOTE(review): emotion_encoded is only inspected here; the model further down is
# trained on the string labels in df['Emotion'], not on this matrix.
emotion_encoded.shape

(839555, 13)

## 文本特征向量化

In [13]:
# Split the preprocessed texts first so that the bag-of-words vocabulary is
# learned from the training portion only; the held-out texts are merely
# transformed with that vocabulary. The test size and seed are unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], df['Emotion'], test_size=0.2, random_state=42
)

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# L2-regularised multiclass logistic regression; fitted in the training cell.
logistic = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)

## 训练

In [14]:
# LogisticRegression.fit returns the fitted estimator itself, so lr_bow is
# simply another name for the now-trained `logistic` object.
logistic.fit(X_train, y_train)
lr_bow = logistic

## 保存

In [15]:
# Persist the fitted vectorizer and classifier, then read both back so that the
# evaluation below exercises the artefacts exactly as they were saved.
VECTORIZER_PATH = 'vectorizer.joblib'
MODEL_PATH = 'model.joblib'

joblib.dump(vectorizer, VECTORIZER_PATH)
joblib.dump(lr_bow, MODEL_PATH)

# joblib files are pickle-based: only load artefacts from a trusted source.
loaded_vectorizer = joblib.load(VECTORIZER_PATH)
loaded_model = joblib.load(MODEL_PATH)

## 评估

In [16]:
# Score the reloaded model on the held-out split.
y_pred = loaded_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

Accuracy: 0.9983324499288313
Classification Report:
              precision    recall  f1-score   support

       anger       1.00      0.99      0.99      2489
     boredom       1.00      0.95      0.98        21
       empty       1.00      0.98      0.99      1096
  enthusiasm       1.00      0.99      1.00      1839
         fun       1.00      0.98      0.99      1977
   happiness       1.00      1.00      1.00      5370
        hate       1.00      0.99      1.00      3018
        love       1.00      1.00      1.00      8001
     neutral       1.00      1.00      1.00    134999
      relief       1.00      0.98      0.99      3396
     sadness       1.00      0.99      0.99      3428
    surprise       1.00      0.99      1.00      1372
       worry       1.00      0.99      1.00       905

    accuracy                           1.00    167911
   macro avg       1.00      0.99      0.99    167911
weighted avg       1.00      1.00      1.00    167911

