{ "cells": [ { "cell_type": "markdown", "id": "988dc733", "metadata": { "papermill": { "duration": 0.007087, "end_time": "2024-06-07T07:59:47.120323", "exception": false, "start_time": "2024-06-07T07:59:47.113236", "status": "completed" }, "tags": [] }, "source": [ "## 导入库" ] }, { "cell_type": "code", "execution_count": 1, "id": "e45a4da1", "metadata": { "execution": { "iopub.execute_input": "2024-06-07T07:59:47.134597Z", "iopub.status.busy": "2024-06-07T07:59:47.134196Z", "iopub.status.idle": "2024-06-07T07:59:49.742368Z", "shell.execute_reply": "2024-06-07T07:59:49.741337Z" }, "papermill": { "duration": 2.618114, "end_time": "2024-06-07T07:59:49.744905", "exception": false, "start_time": "2024-06-07T07:59:47.126791", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "import re\n", "import nltk\n", "from nltk.corpus.reader.tagged import ToktokTokenizer\n", "from bs4 import BeautifulSoup\n", "import pandas as pd\n", "import joblib\n", "\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.preprocessing import LabelBinarizer\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score, classification_report\n", "from sklearn.linear_model import LogisticRegression" ] }, { "cell_type": "markdown", "id": "521ea494", "metadata": { "papermill": { "duration": 0.006048, "end_time": "2024-06-07T07:59:49.757454", "exception": false, "start_time": "2024-06-07T07:59:49.751406", "status": "completed" }, "tags": [] }, "source": [ "## 读取数据" ] }, { "cell_type": "code", "execution_count": 2, "id": "cf510c24", "metadata": { "execution": { "iopub.execute_input": "2024-06-07T07:59:49.771974Z", "iopub.status.busy": "2024-06-07T07:59:49.770991Z", "iopub.status.idle": "2024-06-07T07:59:52.074581Z", "shell.execute_reply": "2024-06-07T07:59:52.073532Z" }, "papermill": { "duration": 2.313203, "end_time": "2024-06-07T07:59:52.076814", "exception": false, "start_time": "2024-06-07T07:59:49.763611", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0textEmotion
00i seriously hate one subject to death but now ...hate
11im so full of life i feel appalledneutral
22i sit here to write i start to dig out my feel...neutral
33ive been really angry with r and i feel like a...anger
44i feel suspicious if there is no one outside l...neutral
\n", "
" ], "text/plain": [ " Unnamed: 0 text Emotion\n", "0 0 i seriously hate one subject to death but now ... hate\n", "1 1 im so full of life i feel appalled neutral\n", "2 2 i sit here to write i start to dig out my feel... neutral\n", "3 3 ive been really angry with r and i feel like a... anger\n", "4 4 i feel suspicious if there is no one outside l... neutral" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('/kaggle/input/emotion-analysis-based-on-text/emotion_sentimen_dataset.csv', encoding='utf-8')\n", "df.head()" ] }, { "cell_type": "markdown", "id": "cdad6e1a", "metadata": { "papermill": { "duration": 0.006328, "end_time": "2024-06-07T07:59:52.089627", "exception": false, "start_time": "2024-06-07T07:59:52.083299", "status": "completed" }, "tags": [] }, "source": [ "## 数据预处理" ] }, { "cell_type": "markdown", "id": "980ecdc4", "metadata": { "papermill": { "duration": 0.006119, "end_time": "2024-06-07T07:59:52.102058", "exception": false, "start_time": "2024-06-07T07:59:52.095939", "status": "completed" }, "tags": [] }, "source": [ "### 去掉'Unnamed:0'列" ] }, { "cell_type": "code", "execution_count": 3, "id": "b6b98b5e", "metadata": { "execution": { "iopub.execute_input": "2024-06-07T07:59:52.116308Z", "iopub.status.busy": "2024-06-07T07:59:52.115936Z", "iopub.status.idle": "2024-06-07T07:59:52.125497Z", "shell.execute_reply": "2024-06-07T07:59:52.124514Z" }, "papermill": { "duration": 0.019378, "end_time": "2024-06-07T07:59:52.127784", "exception": false, "start_time": "2024-06-07T07:59:52.108406", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "'i seriously hate one subject to death but now i feel reluctant to drop it'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "if 'Unnamed: 0' in df.columns:\n", " del df['Unnamed: 0']\n", "df.loc[0]['text']" ] }, { "cell_type": "markdown", "id": "13ba4d1b", "metadata": { "papermill": { "duration": 0.006843, "end_time": "2024-06-07T07:59:52.141633", "exception": false, "start_time": "2024-06-07T07:59:52.134790", "status": "completed" }, "tags": [] }, "source": [ "### 检查缺失值" ] }, { "cell_type": "code", "execution_count": 4, "id": "aaf443cd", "metadata": { "execution": { "iopub.execute_input": "2024-06-07T07:59:52.156361Z", "iopub.status.busy": "2024-06-07T07:59:52.155702Z", "iopub.status.idle": "2024-06-07T07:59:52.260417Z", "shell.execute_reply": "2024-06-07T07:59:52.259372Z" }, "papermill": { "duration": 0.114358, "end_time": "2024-06-07T07:59:52.262476", "exception": false, "start_time": "2024-06-07T07:59:52.148118", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "text False\n", "Emotion False\n", "dtype: bool" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull().any()" ] }, { "cell_type": "markdown", "id": "d210bd7d", "metadata": { "papermill": { "duration": 0.006379, "end_time": "2024-06-07T07:59:52.275724", "exception": false, "start_time": "2024-06-07T07:59:52.269345", "status": "completed" }, "tags": [] }, "source": [ "没有缺失值" ] }, { "cell_type": "markdown", "id": "b020d1da", "metadata": { "papermill": { "duration": 0.006849, "end_time": "2024-06-07T07:59:52.289204", "exception": false, "start_time": "2024-06-07T07:59:52.282355", "status": "completed" }, "tags": [] }, "source": [ "### 统计标签个数" ] }, { "cell_type": "code", "execution_count": 5, "id": "beeb611f", "metadata": { "execution": { "iopub.execute_input": "2024-06-07T07:59:52.304207Z", "iopub.status.busy": "2024-06-07T07:59:52.303413Z", "iopub.status.idle": "2024-06-07T07:59:52.372271Z", "shell.execute_reply": "2024-06-07T07:59:52.371311Z" }, "papermill": { "duration": 0.078711, "end_time": "2024-06-07T07:59:52.374395", "exception": false, "start_time": "2024-06-07T07:59:52.295684", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "Emotion\n", "neutral 674538\n", "love 39553\n", "happiness 27175\n", "sadness 17481\n", "relief 16729\n", "hate 15267\n", "anger 12336\n", "fun 10075\n", "enthusiasm 9304\n", "surprise 6954\n", "empty 5542\n", "worry 4475\n", "boredom 126\n", "Name: count, dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['Emotion'].value_counts()" ] }, { "cell_type": "markdown", "id": "c4325963", "metadata": { "papermill": { "duration": 0.006512, "end_time": "2024-06-07T07:59:52.387868", "exception": false, "start_time": "2024-06-07T07:59:52.381356", "status": "completed" }, "tags": [] }, "source": [ "neutral情绪居多" ] }, { "cell_type": "markdown", "id": "42a255ea", "metadata": { "papermill": { "duration": 0.00651, "end_time": "2024-06-07T07:59:52.401107", "exception": false, "start_time": "2024-06-07T07:59:52.394597", "status": "completed" }, "tags": [] }, "source": [ "### 过滤HTML" ] }, { "cell_type": "code", "execution_count": 6, "id": "f6340f4d", "metadata": { "execution": { "iopub.execute_input": "2024-06-07T07:59:52.416407Z", "iopub.status.busy": "2024-06-07T07:59:52.415653Z", "iopub.status.idle": "2024-06-07T07:59:52.420831Z", "shell.execute_reply": "2024-06-07T07:59:52.419846Z" }, "papermill": { "duration": 0.014951, "end_time": "2024-06-07T07:59:52.422798", "exception": false, "start_time": "2024-06-07T07:59:52.407847", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "def noiseremovel_text(text):\n", " soup = BeautifulSoup(text, \"html.parser\")\n", " text = soup.get_text()\n", " text = re.sub(r'\\[[^]]*\\]', '', text) \n", " return text" ] }, { "cell_type": "code", "execution_count": 7, "id": "324c0fd5", "metadata": { "execution": { "iopub.execute_input": "2024-06-07T07:59:52.439372Z", "iopub.status.busy": "2024-06-07T07:59:52.438551Z", "iopub.status.idle": "2024-06-07T07:59:52.445213Z", "shell.execute_reply": "2024-06-07T07:59:52.444197Z" }, "papermill": { "duration": 0.016475, "end_time": "2024-06-07T07:59:52.447222", "exception": false, "start_time": "2024-06-07T07:59:52.430747", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "'I really enjoyed the latest episode of my favorite show! Check out this link for a recap. #bestshowever '" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample_text = '
I really enjoyed the latest episode of my favorite show! [https://t.co/xyz123] Check out this link for a recap. #bestshowever [Ad: Stream now on MyStreamingService for 50% off!]
'\n", "trans_sample_text = noiseremovel_text(sample_text)\n", "trans_sample_text" ] }, { "cell_type": "code", "execution_count": 8, "id": "532f7045", "metadata": { "execution": { "iopub.execute_input": "2024-06-07T07:59:52.462719Z", "iopub.status.busy": "2024-06-07T07:59:52.461914Z", "iopub.status.idle": "2024-06-07T08:00:34.124522Z", "shell.execute_reply": "2024-06-07T08:00:34.123423Z" }, "papermill": { "duration": 41.673227, "end_time": "2024-06-07T08:00:34.127258", "exception": false, "start_time": "2024-06-07T07:59:52.454031", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "df['text'] = df['text'].apply(noiseremovel_text)" ] }, { "cell_type": "markdown", "id": "c3b24680", "metadata": { "papermill": { "duration": 0.006637, "end_time": "2024-06-07T08:00:34.141272", "exception": false, "start_time": "2024-06-07T08:00:34.134635", "status": "completed" }, "tags": [] }, "source": [ "### 移除stopwords" ] }, { "cell_type": "code", "execution_count": 9, "id": "e48353c3", "metadata": { "execution": { "iopub.execute_input": "2024-06-07T08:00:34.157003Z", "iopub.status.busy": "2024-06-07T08:00:34.156235Z", "iopub.status.idle": "2024-06-07T08:00:34.382132Z", "shell.execute_reply": "2024-06-07T08:00:34.381037Z" }, "papermill": { "duration": 0.23647, "end_time": "2024-06-07T08:00:34.384580", "exception": false, "start_time": "2024-06-07T08:00:34.148110", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /usr/share/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] } ], "source": [ "nltk.download('stopwords')\n", "stopwords = nltk.corpus.stopwords.words('english')\n", "stop_wr = set(stopwords)\n", "\n", "def remove_stopwords(text, stop_words):\n", " tokenizers = ToktokTokenizer()\n", " #提取单词和缩写\n", " words = re.findall(r'\\w+|\\.\\.+', text)\n", " stop_words = set(stop_words)\n", " filtokens = [i for i in words if i.lower() not in stop_words]\n", " # 连接\n", " filtered_text = ' '.join(filtokens)\n", " return filtered_text" ] }, { "cell_type": "code", "execution_count": 10, "id": "f328854d", "metadata": { "execution": { "iopub.execute_input": "2024-06-07T08:00:34.402759Z", "iopub.status.busy": "2024-06-07T08:00:34.401319Z", "iopub.status.idle": "2024-06-07T08:00:34.411254Z", "shell.execute_reply": "2024-06-07T08:00:34.410112Z" }, "papermill": { "duration": 0.021777, "end_time": "2024-06-07T08:00:34.413600", "exception": false, "start_time": "2024-06-07T08:00:34.391823", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "'seriously hate one subject death feel reluctant drop'" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample_text = 'i seriously hate one subject to death but now i feel reluctant to drop it'\n", "trans_sample_text = remove_stopwords(sample_text, stop_wr)\n", "trans_sample_text" ] }, { "cell_type": "code", "execution_count": 11, "id": "36a11862", "metadata": { "execution": { "iopub.execute_input": "2024-06-07T08:00:34.429514Z", "iopub.status.busy": "2024-06-07T08:00:34.429146Z", "iopub.status.idle": "2024-06-07T08:00:45.852555Z", "shell.execute_reply": "2024-06-07T08:00:45.851505Z" }, "papermill": { "duration": 11.434069, "end_time": "2024-06-07T08:00:45.855050", "exception": false, "start_time": "2024-06-07T08:00:34.420981", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "df['text'] = df['text'].apply(remove_stopwords, stop_words=stop_wr)" ] }, { "cell_type": "markdown", "id": "59754b96", "metadata": { "papermill": { "duration": 0.006971, "end_time": "2024-06-07T08:00:45.869210", "exception": false, "start_time": "2024-06-07T08:00:45.862239", "status": "completed" }, "tags": [] }, "source": [ "## 对情绪标签编码" ] }, { "cell_type": "code", "execution_count": 12, "id": "0d093d66", "metadata": { "execution": { "iopub.execute_input": "2024-06-07T08:00:45.885460Z", "iopub.status.busy": "2024-06-07T08:00:45.885088Z", "iopub.status.idle": "2024-06-07T08:00:49.595642Z", "shell.execute_reply": "2024-06-07T08:00:49.594472Z" }, "papermill": { "duration": 3.721209, "end_time": "2024-06-07T08:00:49.597909", "exception": false, "start_time": "2024-06-07T08:00:45.876700", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "(839555, 13)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "label_binarizer = LabelBinarizer()\n", "\n", "emotion_encoded = label_binarizer.fit_transform(df['Emotion'])\n", "emotion_encoded.shape" ] }, { "cell_type": "markdown", "id": "f39d81be", "metadata": { "papermill": { "duration": 0.00708, "end_time": "2024-06-07T08:00:49.612623", "exception": false, "start_time": "2024-06-07T08:00:49.605543", "status": "completed" }, "tags": [] }, "source": [ "## 文本特征向量化" ] }, { "cell_type": "code", "execution_count": 13, "id": "67333b0c", "metadata": { "execution": { "iopub.execute_input": "2024-06-07T08:00:49.629513Z", "iopub.status.busy": "2024-06-07T08:00:49.628471Z", "iopub.status.idle": "2024-06-07T08:00:58.957626Z", "shell.execute_reply": "2024-06-07T08:00:58.956558Z" }, "papermill": { "duration": 9.340419, "end_time": "2024-06-07T08:00:58.960340", "exception": false, "start_time": "2024-06-07T08:00:49.619921", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "vectorizer = CountVectorizer()\n", "X_bow = vectorizer.fit_transform(df['text'])\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X_bow, df['Emotion'], test_size=0.2, random_state=42)\n", "\n", "logistic = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)" ] }, { "cell_type": "markdown", "id": "b6e72f70", "metadata": { "papermill": { "duration": 0.007038, "end_time": "2024-06-07T08:00:58.974906", "exception": false, "start_time": "2024-06-07T08:00:58.967868", "status": "completed" }, "tags": [] }, "source": [ "## 训练" ] }, { "cell_type": "code", "execution_count": 14, "id": "c02112ed", "metadata": { "execution": { "iopub.execute_input": "2024-06-07T08:00:58.991472Z", "iopub.status.busy": "2024-06-07T08:00:58.991101Z", "iopub.status.idle": "2024-06-07T08:04:00.220701Z", "shell.execute_reply": "2024-06-07T08:04:00.219324Z" }, "papermill": { "duration": 181.241697, "end_time": "2024-06-07T08:04:00.224168", "exception": false, "start_time": "2024-06-07T08:00:58.982471", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "lr_bow = logistic.fit(X_train, y_train)" ] }, { "cell_type": "markdown", "id": "a933590f", "metadata": { "papermill": { "duration": 0.011677, "end_time": "2024-06-07T08:04:00.248922", "exception": false, "start_time": "2024-06-07T08:04:00.237245", "status": "completed" }, "tags": [] }, "source": [ "## 保存" ] }, { "cell_type": "code", "execution_count": 15, "id": "c7529929", "metadata": { "execution": { "iopub.execute_input": "2024-06-07T08:04:00.265919Z", "iopub.status.busy": "2024-06-07T08:04:00.264742Z", "iopub.status.idle": "2024-06-07T08:04:01.081469Z", "shell.execute_reply": "2024-06-07T08:04:01.080435Z" }, "papermill": { "duration": 0.82782, "end_time": "2024-06-07T08:04:01.084194", "exception": false, "start_time": "2024-06-07T08:04:00.256374", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "joblib.dump(vectorizer, 'vectorizer.joblib')\n", "joblib.dump(lr_bow, 'model.joblib')\n", "\n", "loaded_vectorizer = joblib.load('vectorizer.joblib')\n", "loaded_model = joblib.load('model.joblib')" ] }, { "cell_type": "markdown", "id": "57657a5e", "metadata": { "papermill": { "duration": 0.007039, "end_time": "2024-06-07T08:04:01.098694", "exception": false, "start_time": "2024-06-07T08:04:01.091655", "status": "completed" }, "tags": [] }, "source": [ "## 评估" ] }, { "cell_type": "code", "execution_count": 16, "id": "688b5e38", "metadata": { "execution": { "iopub.execute_input": "2024-06-07T08:04:01.114633Z", "iopub.status.busy": "2024-06-07T08:04:01.113895Z", "iopub.status.idle": "2024-06-07T08:04:08.550523Z", "shell.execute_reply": "2024-06-07T08:04:08.549515Z" }, "papermill": { "duration": 7.447225, "end_time": "2024-06-07T08:04:08.552950", "exception": false, "start_time": "2024-06-07T08:04:01.105725", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.9983324499288313\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " anger 1.00 0.99 0.99 2489\n", " boredom 1.00 0.95 0.98 21\n", " empty 1.00 0.98 0.99 1096\n", " enthusiasm 1.00 0.99 1.00 1839\n", " fun 1.00 0.98 0.99 1977\n", " happiness 1.00 1.00 1.00 5370\n", " hate 1.00 0.99 1.00 3018\n", " love 1.00 1.00 1.00 8001\n", " neutral 1.00 1.00 1.00 134999\n", " relief 1.00 0.98 0.99 3396\n", " sadness 1.00 0.99 0.99 3428\n", " surprise 1.00 0.99 1.00 1372\n", " worry 1.00 0.99 1.00 905\n", "\n", " accuracy 1.00 167911\n", " macro avg 1.00 0.99 0.99 167911\n", "weighted avg 1.00 1.00 1.00 167911\n", "\n" ] } ], "source": [ "# 评估\n", "y_pred = loaded_model.predict(X_test)\n", "\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"Accuracy:\", accuracy)\n", "\n", "print(\"Classification Report:\")\n", "print(classification_report(y_test, y_pred))" ] }, { "cell_type": "code", "execution_count": null, "id": "ea741f3a", "metadata": { "papermill": { "duration": 0.007129, "end_time": "2024-06-07T08:04:08.567812", "exception": false, "start_time": "2024-06-07T08:04:08.560683", "status": "completed" }, "tags": [] }, "outputs": [], "source": [] } ], "metadata": { "kaggle": { "accelerator": "none", "dataSources": [ { "datasetId": 4540583, "sourceId": 7763359, "sourceType": "datasetVersion" } ], "dockerImageVersionId": 30732, "isGpuEnabled": false, "isInternetEnabled": true, "language": "python", "sourceType": "notebook" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" }, "papermill": { "default_parameters": {}, "duration": 264.700867, "end_time": "2024-06-07T08:04:09.296576", "environment_variables": {}, "exception": null, "input_path": "__notebook__.ipynb", "output_path": "__notebook__.ipynb", "parameters": {}, "start_time": "2024-06-07T07:59:44.595709", "version": "2.5.0" } }, "nbformat": 4, "nbformat_minor": 5 }