0x-74 committed on
Commit
e941256
·
1 Parent(s): b973975

final_deployment

Browse files
Files changed (11) hide show
  1. .gitattributes +0 -35
  2. .gitignore +1 -0
  3. Dockerfile +13 -0
  4. README.md +0 -10
  5. data.csv +0 -0
  6. encoder.pkl +0 -0
  7. helper.py +39 -0
  8. main.ipynb +875 -0
  9. main.py +83 -0
  10. model.pkl +0 -0
  11. requirements.txt +7 -0
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+
3
+ RUN useradd -m -u 1000 user
4
+ USER user
5
+ ENV PATH="/home/user/.local/bin:$PATH"
6
+
7
+ WORKDIR /app
8
+
9
+ COPY --chown=user ./requirements.txt requirements.txt
10
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
+
12
+ COPY --chown=user . /app
13
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md DELETED
@@ -1,10 +0,0 @@
1
- ---
2
- title: Dps Challenge
3
- emoji: 🔥
4
- colorFrom: indigo
5
- colorTo: indigo
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
data.csv ADDED
The diff for this file is too large to render. See raw diff
 
encoder.pkl ADDED
Binary file (1.14 kB). View file
 
helper.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import calendar
2
+ import pandas as pd
3
+ from sklearn.preprocessing import OneHotEncoder
4
+
5
def convert_to_month_name(year_month):
    """Convert a ``YYYYMM`` value into its month name.

    The aggregate marker ``'Summe'`` is passed through unchanged. Any other
    value is converted to a string and characters 4-5 (the ``MM`` part) are
    looked up in ``calendar.month_name``.

    Args:
        year_month: ``'Summe'`` or a ``YYYYMM`` value (string or integer),
            e.g. ``'201207'``.

    Returns:
        ``'Summe'`` or the month name from ``calendar.month_name``
        (e.g. ``'July'``).

    Raises:
        ValueError: If the month part is missing, not an integer, or not in
            the range 1-12.
    """
    if year_month == 'Summe':
        return 'Summe'

    month_digits = str(year_month)[4:6]
    try:
        month = int(month_digits)
    except ValueError:
        raise ValueError(
            f"Cannot extract a month from {year_month!r}; expected 'YYYYMM' or 'Summe'"
        ) from None

    # calendar.month_name[0] is '' and negative indices wrap around, so an
    # unchecked lookup would silently return a wrong month name.
    if not 1 <= month <= 12:
        raise ValueError(
            f"Month {month_digits!r} in {year_month!r} is out of range 01-12"
        )

    return calendar.month_name[month]
14
+
15
+
16
def transform_new_data(new_data, encoder, original_one_hot_columns):
    """Apply an already-fitted encoder to new rows.

    Args:
        new_data: DataFrame containing at least ``original_one_hot_columns``.
            It is copied first and never modified.
        encoder: Fitted encoder exposing ``transform`` and
            ``get_feature_names_out`` (the notebook fits a scikit-learn
            ``OneHotEncoder``).
        original_one_hot_columns: Columns to replace by their encoded form.

    Returns:
        A new DataFrame with the remaining columns of ``new_data`` followed by
        the encoded columns, sharing the index of ``new_data``.
    """
    frame = new_data.copy()

    # Build the encoded block on the same index so the column-wise concat
    # below aligns row for row.
    one_hot = pd.DataFrame(
        encoder.transform(frame[original_one_hot_columns]),
        columns=encoder.get_feature_names_out(original_one_hot_columns),
        index=frame.index,
    )

    passthrough = frame.drop(columns=original_one_hot_columns)
    return pd.concat([passthrough, one_hot], axis=1)
main.ipynb ADDED
@@ -0,0 +1,875 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 1. loading data\n",
8
+ "## installing packages "
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 79,
14
+ "metadata": {
15
+ "id": "E7Ae3ZiczQVT"
16
+ },
17
+ "outputs": [],
18
+ "source": [
19
+ "%pip install seaborn numpy pandas xgboost -qqU"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "markdown",
24
+ "metadata": {},
25
+ "source": [
26
+ "# Importing libs"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "metadata": {
33
+ "id": "Sesct3fTzQVW"
34
+ },
35
+ "outputs": [],
36
+ "source": [
37
+ "import pandas as pd\n",
38
+ "import numpy as np\n",
39
+ "import seaborn as sns\n",
40
+ "import matplotlib.pyplot as plt\n",
41
+ "import pandas as pd\n",
42
+ "from sklearn.model_selection import train_test_split, GridSearchCV\n",
43
+ "from xgboost import XGBRegressor\n",
44
+ "import xgboost\n",
45
+ "from sklearn.metrics import mean_squared_error\n",
46
+ "import seaborn as sns\n",
47
+ "import matplotlib.pyplot as plt\n",
48
+ "import calendar\n",
49
+ "from sklearn.preprocessing import OneHotEncoder\n",
50
+ "import pickle"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "markdown",
55
+ "metadata": {},
56
+ "source": [
57
+ "# reading data and discarding data of years after 2019"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": 122,
63
+ "metadata": {
64
+ "id": "WGqXaM-XzQVX"
65
+ },
66
+ "outputs": [],
67
+ "source": [
68
+ "data = pd.read_csv('data.csv')"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 123,
74
+ "metadata": {
75
+ "id": "zBBpsXqszQVY"
76
+ },
77
+ "outputs": [],
78
+ "source": [
79
+ "data = data[~(data['JAHR'] > 2019)]\n",
80
+ "data = data[data.columns[:5]]"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": 124,
86
+ "metadata": {
87
+ "id": "r5SG6rjMzQVY"
88
+ },
89
+ "outputs": [],
90
+ "source": [
91
+ "data.reset_index(drop=True, inplace=True)"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "markdown",
96
+ "metadata": {},
97
+ "source": [
98
+ "## Here I removed the outliers beyond the z-score threshold, because these values often don't provide any useful information or help the model converge"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": 125,
104
+ "metadata": {
105
+ "id": "jJVSlcgGzQVZ"
106
+ },
107
+ "outputs": [],
108
+ "source": [
109
+ "def remove_outliers(df, columns=None, z_threshold=3):\n",
110
+ "\n",
111
+ " if columns is None:\n",
112
+ " columns = df.select_dtypes(include=[np.number]).columns\n",
113
+ "\n",
114
+ "\n",
115
+ " df_clean = df.copy()\n",
116
+ "\n",
117
+ "\n",
118
+ " for col in columns:\n",
119
+ " z_scores = np.abs((df_clean[col] - df_clean[col].mean()) / df_clean[col].std())\n",
120
+ " df_clean = df_clean[z_scores < z_threshold]\n",
121
+ "\n",
122
+ " return df_clean\n",
123
+ "\n",
124
+ "\n",
125
+ "data = remove_outliers(data)\n"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": null,
131
+ "metadata": {
132
+ "colab": {
133
+ "base_uri": "https://localhost:8080/",
134
+ "height": 234
135
+ },
136
+ "id": "hYPXBmXfzQVa",
137
+ "outputId": "a153268a-04a1-483a-f4f1-7fd250224248"
138
+ },
139
+ "outputs": [
140
+ {
141
+ "data": {
142
+ "text/html": [
143
+ "<div>\n",
144
+ "<style scoped>\n",
145
+ " .dataframe tbody tr th:only-of-type {\n",
146
+ " vertical-align: middle;\n",
147
+ " }\n",
148
+ "\n",
149
+ " .dataframe tbody tr th {\n",
150
+ " vertical-align: top;\n",
151
+ " }\n",
152
+ "\n",
153
+ " .dataframe thead th {\n",
154
+ " text-align: right;\n",
155
+ " }\n",
156
+ "</style>\n",
157
+ "<table border=\"1\" class=\"dataframe\">\n",
158
+ " <thead>\n",
159
+ " <tr style=\"text-align: right;\">\n",
160
+ " <th></th>\n",
161
+ " <th>0</th>\n",
162
+ " </tr>\n",
163
+ " </thead>\n",
164
+ " <tbody>\n",
165
+ " <tr>\n",
166
+ " <th>MONATSZAHL</th>\n",
167
+ " <td>0</td>\n",
168
+ " </tr>\n",
169
+ " <tr>\n",
170
+ " <th>AUSPRAEGUNG</th>\n",
171
+ " <td>0</td>\n",
172
+ " </tr>\n",
173
+ " <tr>\n",
174
+ " <th>JAHR</th>\n",
175
+ " <td>0</td>\n",
176
+ " </tr>\n",
177
+ " <tr>\n",
178
+ " <th>MONAT</th>\n",
179
+ " <td>0</td>\n",
180
+ " </tr>\n",
181
+ " <tr>\n",
182
+ " <th>WERT</th>\n",
183
+ " <td>0</td>\n",
184
+ " </tr>\n",
185
+ " </tbody>\n",
186
+ "</table>\n",
187
+ "</div><br><label><b>dtype:</b> int64</label>"
188
+ ],
189
+ "text/plain": [
190
+ "MONATSZAHL 0\n",
191
+ "AUSPRAEGUNG 0\n",
192
+ "JAHR 0\n",
193
+ "MONAT 0\n",
194
+ "WERT 0\n",
195
+ "dtype: int64"
196
+ ]
197
+ },
198
+ "execution_count": 126,
199
+ "metadata": {},
200
+ "output_type": "execute_result"
201
+ }
202
+ ],
203
+ "source": [
204
+ "# checked for null values\n",
205
+ "data.isna().sum()"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "markdown",
210
+ "metadata": {},
211
+ "source": [
212
+ "# 2. Visualizing the data as per the assignment"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": 127,
218
+ "metadata": {
219
+ "colab": {
220
+ "base_uri": "https://localhost:8080/",
221
+ "height": 421
222
+ },
223
+ "id": "2KxsVKNB4bCE",
224
+ "outputId": "06590c0a-c2b2-44a9-f293-5513eeae6a56"
225
+ },
226
+ "outputs": [
227
+ {
228
+ "data": {
229
+ "image/png": "",
230
+ "text/plain": [
231
+ "<Figure size 1000x600 with 1 Axes>"
232
+ ]
233
+ },
234
+ "metadata": {},
235
+ "output_type": "display_data"
236
+ }
237
+ ],
238
+ "source": [
239
+ "value_counts = data['MONATSZAHL'].value_counts()\n",
240
+ "plt.figure(figsize=(10, 6))\n",
241
+ "sns.barplot(x=value_counts.index, y=value_counts.values)\n",
242
+ "plt.xlabel('MONATSZAHL')\n",
243
+ "plt.ylabel('Count')\n",
244
+ "plt.title('Distribution of Unique Values in MONATSZAHL')\n",
245
+ "plt.show()\n"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "markdown",
250
+ "metadata": {},
251
+ "source": [
252
+ "## I saw some redundant information here, so I converted the MONAT values to contain only the month name, because the JAHR column already provides the year"
253
+ ]
254
+ },
255
+ {
256
+ "cell_type": "code",
257
+ "execution_count": 128,
258
+ "metadata": {
259
+ "id": "oYWFkNVw6GuR"
260
+ },
261
+ "outputs": [],
262
+ "source": [
263
+ "def convert_to_month_name(year_month):\n",
264
+ "\n",
265
+ " if year_month == 'Summe':\n",
266
+ " return 'Summe'\n",
267
+ " month = str(year_month)[4:6]\n",
268
+ "\n",
269
+ "\n",
270
+ " month_name = calendar.month_name[int(month)]\n",
271
+ " return month_name\n",
272
+ "data['MONAT'] = data['MONAT'].apply(convert_to_month_name)"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "code",
277
+ "execution_count": 129,
278
+ "metadata": {
279
+ "colab": {
280
+ "base_uri": "https://localhost:8080/",
281
+ "height": 509
282
+ },
283
+ "id": "2nZgB2c28Vub",
284
+ "outputId": "96251141-a163-49e9-ea10-6bfedc892cc6"
285
+ },
286
+ "outputs": [
287
+ {
288
+ "data": {
289
+ "text/html": [
290
+ "<div>\n",
291
+ "<style scoped>\n",
292
+ " .dataframe tbody tr th:only-of-type {\n",
293
+ " vertical-align: middle;\n",
294
+ " }\n",
295
+ "\n",
296
+ " .dataframe tbody tr th {\n",
297
+ " vertical-align: top;\n",
298
+ " }\n",
299
+ "\n",
300
+ " .dataframe thead th {\n",
301
+ " text-align: right;\n",
302
+ " }\n",
303
+ "</style>\n",
304
+ "<table border=\"1\" class=\"dataframe\">\n",
305
+ " <thead>\n",
306
+ " <tr style=\"text-align: right;\">\n",
307
+ " <th></th>\n",
308
+ " <th>count</th>\n",
309
+ " </tr>\n",
310
+ " <tr>\n",
311
+ " <th>MONAT</th>\n",
312
+ " <th></th>\n",
313
+ " </tr>\n",
314
+ " </thead>\n",
315
+ " <tbody>\n",
316
+ " <tr>\n",
317
+ " <th>January</th>\n",
318
+ " <td>140</td>\n",
319
+ " </tr>\n",
320
+ " <tr>\n",
321
+ " <th>March</th>\n",
322
+ " <td>140</td>\n",
323
+ " </tr>\n",
324
+ " <tr>\n",
325
+ " <th>February</th>\n",
326
+ " <td>140</td>\n",
327
+ " </tr>\n",
328
+ " <tr>\n",
329
+ " <th>April</th>\n",
330
+ " <td>140</td>\n",
331
+ " </tr>\n",
332
+ " <tr>\n",
333
+ " <th>May</th>\n",
334
+ " <td>140</td>\n",
335
+ " </tr>\n",
336
+ " <tr>\n",
337
+ " <th>September</th>\n",
338
+ " <td>140</td>\n",
339
+ " </tr>\n",
340
+ " <tr>\n",
341
+ " <th>June</th>\n",
342
+ " <td>140</td>\n",
343
+ " </tr>\n",
344
+ " <tr>\n",
345
+ " <th>July</th>\n",
346
+ " <td>140</td>\n",
347
+ " </tr>\n",
348
+ " <tr>\n",
349
+ " <th>August</th>\n",
350
+ " <td>140</td>\n",
351
+ " </tr>\n",
352
+ " <tr>\n",
353
+ " <th>November</th>\n",
354
+ " <td>140</td>\n",
355
+ " </tr>\n",
356
+ " <tr>\n",
357
+ " <th>October</th>\n",
358
+ " <td>140</td>\n",
359
+ " </tr>\n",
360
+ " <tr>\n",
361
+ " <th>December</th>\n",
362
+ " <td>140</td>\n",
363
+ " </tr>\n",
364
+ " <tr>\n",
365
+ " <th>Summe</th>\n",
366
+ " <td>114</td>\n",
367
+ " </tr>\n",
368
+ " </tbody>\n",
369
+ "</table>\n",
370
+ "</div><br><label><b>dtype:</b> int64</label>"
371
+ ],
372
+ "text/plain": [
373
+ "MONAT\n",
374
+ "January 140\n",
375
+ "March 140\n",
376
+ "February 140\n",
377
+ "April 140\n",
378
+ "May 140\n",
379
+ "September 140\n",
380
+ "June 140\n",
381
+ "July 140\n",
382
+ "August 140\n",
383
+ "November 140\n",
384
+ "October 140\n",
385
+ "December 140\n",
386
+ "Summe 114\n",
387
+ "Name: count, dtype: int64"
388
+ ]
389
+ },
390
+ "execution_count": 129,
391
+ "metadata": {},
392
+ "output_type": "execute_result"
393
+ }
394
+ ],
395
+ "source": [
396
+ "data['MONAT'].value_counts()"
397
+ ]
398
+ },
399
+ {
400
+ "cell_type": "code",
401
+ "execution_count": 130,
402
+ "metadata": {
403
+ "colab": {
404
+ "base_uri": "https://localhost:8080/",
405
+ "height": 723
406
+ },
407
+ "id": "5Dh3SgWE82hi",
408
+ "outputId": "5455f4dd-88ea-4626-8fc8-000cfcc0afb5"
409
+ },
410
+ "outputs": [
411
+ {
412
+ "data": {
413
+ "text/html": [
414
+ "<div>\n",
415
+ "<style scoped>\n",
416
+ " .dataframe tbody tr th:only-of-type {\n",
417
+ " vertical-align: middle;\n",
418
+ " }\n",
419
+ "\n",
420
+ " .dataframe tbody tr th {\n",
421
+ " vertical-align: top;\n",
422
+ " }\n",
423
+ "\n",
424
+ " .dataframe thead th {\n",
425
+ " text-align: right;\n",
426
+ " }\n",
427
+ "</style>\n",
428
+ "<table border=\"1\" class=\"dataframe\">\n",
429
+ " <thead>\n",
430
+ " <tr style=\"text-align: right;\">\n",
431
+ " <th></th>\n",
432
+ " <th>count</th>\n",
433
+ " </tr>\n",
434
+ " <tr>\n",
435
+ " <th>JAHR</th>\n",
436
+ " <th></th>\n",
437
+ " </tr>\n",
438
+ " </thead>\n",
439
+ " <tbody>\n",
440
+ " <tr>\n",
441
+ " <th>2019</th>\n",
442
+ " <td>90</td>\n",
443
+ " </tr>\n",
444
+ " <tr>\n",
445
+ " <th>2018</th>\n",
446
+ " <td>90</td>\n",
447
+ " </tr>\n",
448
+ " <tr>\n",
449
+ " <th>2017</th>\n",
450
+ " <td>90</td>\n",
451
+ " </tr>\n",
452
+ " <tr>\n",
453
+ " <th>2016</th>\n",
454
+ " <td>90</td>\n",
455
+ " </tr>\n",
456
+ " <tr>\n",
457
+ " <th>2015</th>\n",
458
+ " <td>90</td>\n",
459
+ " </tr>\n",
460
+ " <tr>\n",
461
+ " <th>2014</th>\n",
462
+ " <td>90</td>\n",
463
+ " </tr>\n",
464
+ " <tr>\n",
465
+ " <th>2013</th>\n",
466
+ " <td>90</td>\n",
467
+ " </tr>\n",
468
+ " <tr>\n",
469
+ " <th>2012</th>\n",
470
+ " <td>90</td>\n",
471
+ " </tr>\n",
472
+ " <tr>\n",
473
+ " <th>2011</th>\n",
474
+ " <td>90</td>\n",
475
+ " </tr>\n",
476
+ " <tr>\n",
477
+ " <th>2010</th>\n",
478
+ " <td>90</td>\n",
479
+ " </tr>\n",
480
+ " <tr>\n",
481
+ " <th>2009</th>\n",
482
+ " <td>90</td>\n",
483
+ " </tr>\n",
484
+ " <tr>\n",
485
+ " <th>2008</th>\n",
486
+ " <td>90</td>\n",
487
+ " </tr>\n",
488
+ " <tr>\n",
489
+ " <th>2007</th>\n",
490
+ " <td>90</td>\n",
491
+ " </tr>\n",
492
+ " <tr>\n",
493
+ " <th>2006</th>\n",
494
+ " <td>90</td>\n",
495
+ " </tr>\n",
496
+ " <tr>\n",
497
+ " <th>2005</th>\n",
498
+ " <td>90</td>\n",
499
+ " </tr>\n",
500
+ " <tr>\n",
501
+ " <th>2004</th>\n",
502
+ " <td>90</td>\n",
503
+ " </tr>\n",
504
+ " <tr>\n",
505
+ " <th>2003</th>\n",
506
+ " <td>90</td>\n",
507
+ " </tr>\n",
508
+ " <tr>\n",
509
+ " <th>2002</th>\n",
510
+ " <td>90</td>\n",
511
+ " </tr>\n",
512
+ " <tr>\n",
513
+ " <th>2001</th>\n",
514
+ " <td>90</td>\n",
515
+ " </tr>\n",
516
+ " <tr>\n",
517
+ " <th>2000</th>\n",
518
+ " <td>84</td>\n",
519
+ " </tr>\n",
520
+ " </tbody>\n",
521
+ "</table>\n",
522
+ "</div><br><label><b>dtype:</b> int64</label>"
523
+ ],
524
+ "text/plain": [
525
+ "JAHR\n",
526
+ "2019 90\n",
527
+ "2018 90\n",
528
+ "2017 90\n",
529
+ "2016 90\n",
530
+ "2015 90\n",
531
+ "2014 90\n",
532
+ "2013 90\n",
533
+ "2012 90\n",
534
+ "2011 90\n",
535
+ "2010 90\n",
536
+ "2009 90\n",
537
+ "2008 90\n",
538
+ "2007 90\n",
539
+ "2006 90\n",
540
+ "2005 90\n",
541
+ "2004 90\n",
542
+ "2003 90\n",
543
+ "2002 90\n",
544
+ "2001 90\n",
545
+ "2000 84\n",
546
+ "Name: count, dtype: int64"
547
+ ]
548
+ },
549
+ "execution_count": 130,
550
+ "metadata": {},
551
+ "output_type": "execute_result"
552
+ }
553
+ ],
554
+ "source": [
555
+ "data['JAHR'].value_counts()"
556
+ ]
557
+ },
558
+ {
559
+ "cell_type": "markdown",
560
+ "metadata": {},
561
+ "source": [
562
+ "### After checking for data imbalances, we can now train the model. We first one-hot encode all the categorical columns. Another good approach, in my opinion, would be to use ordinal encoding, but my results didn't differ much, so I went with one-hot encoding"
563
+ ]
564
+ },
565
+ {
566
+ "cell_type": "code",
567
+ "execution_count": 134,
568
+ "metadata": {
569
+ "id": "O1m3dRKbGTN9"
570
+ },
571
+ "outputs": [],
572
+ "source": [
573
+ "one_hot_columns = data.columns[0:4]"
574
+ ]
575
+ },
576
+ {
577
+ "cell_type": "code",
578
+ "execution_count": 136,
579
+ "metadata": {
580
+ "id": "poFSghx08_ig"
581
+ },
582
+ "outputs": [],
583
+ "source": [
584
+ "\n",
585
+ "def one_hot_encode(data, one_hot_columns):\n",
586
+ "\n",
587
+ " data_copy = data.copy()\n",
588
+ "\n",
589
+ "\n",
590
+ " encoder = OneHotEncoder(sparse_output=False)\n",
591
+ "\n",
592
+ "\n",
593
+ " encoded_columns = encoder.fit_transform(data_copy[one_hot_columns])\n",
594
+ "\n",
595
+ "\n",
596
+ " encoded_column_names = encoder.get_feature_names_out(one_hot_columns)\n",
597
+ "\n",
598
+ "\n",
599
+ " encoded_df = pd.DataFrame(\n",
600
+ " encoded_columns,\n",
601
+ " columns=encoded_column_names,\n",
602
+ " index=data_copy.index\n",
603
+ " )\n",
604
+ "\n",
605
+ "\n",
606
+ " result_df = pd.concat([\n",
607
+ " data_copy.drop(columns=one_hot_columns),\n",
608
+ " encoded_df\n",
609
+ " ], axis=1)\n",
610
+ "\n",
611
+ " return result_df, encoder\n",
612
+ "\n",
613
+ "def transform_new_data(new_data, encoder, original_one_hot_columns):\n",
614
+ "\n",
615
+ " new_data_copy = new_data.copy()\n",
616
+ "\n",
617
+ "\n",
618
+ " encoded_columns = encoder.transform(new_data_copy[original_one_hot_columns])\n",
619
+ "\n",
620
+ "\n",
621
+ " encoded_column_names = encoder.get_feature_names_out(original_one_hot_columns)\n",
622
+ "\n",
623
+ "\n",
624
+ " encoded_df = pd.DataFrame(\n",
625
+ " encoded_columns,\n",
626
+ " columns=encoded_column_names,\n",
627
+ " index=new_data_copy.index\n",
628
+ " )\n",
629
+ "\n",
630
+ "\n",
631
+ " result_df = pd.concat([\n",
632
+ " new_data_copy.drop(columns=original_one_hot_columns),\n",
633
+ " encoded_df\n",
634
+ " ], axis=1)\n",
635
+ "\n",
636
+ " return result_df"
637
+ ]
638
+ },
639
+ {
640
+ "cell_type": "code",
641
+ "execution_count": 139,
642
+ "metadata": {
643
+ "id": "3a18WFefHY3F"
644
+ },
645
+ "outputs": [],
646
+ "source": [
647
+ "data, encoder = one_hot_encode(data, one_hot_columns)"
648
+ ]
649
+ },
650
+ {
651
+ "cell_type": "code",
652
+ "execution_count": 93,
653
+ "metadata": {
654
+ "colab": {
655
+ "base_uri": "https://localhost:8080/"
656
+ },
657
+ "id": "n8x7Qli20z_-",
658
+ "outputId": "bd6fd971-0f61-4a08-b5e1-9863b285ea71"
659
+ },
660
+ "outputs": [
661
+ {
662
+ "name": "stdout",
663
+ "output_type": "stream",
664
+ "text": [
665
+ "{'BUILTIN_PREFETCH_PRESENT': True, 'CUDA_VERSION': [11, 8], 'DEBUG': False, 'GCC_VERSION': [10, 3, 1], 'MM_PREFETCH_PRESENT': True, 'NCCL_VERSION': [2, 16, 5], 'THRUST_VERSION': [1, 15, 1], 'USE_CUDA': True, 'USE_DLOPEN_NCCL': True, 'USE_FEDERATED': True, 'USE_NCCL': True, 'USE_OPENMP': True, 'USE_RMM': False, 'libxgboost': '/usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so'}\n"
666
+ ]
667
+ }
668
+ ],
669
+ "source": [
670
+ "print(xgboost.build_info())"
671
+ ]
672
+ },
673
+ {
674
+ "cell_type": "markdown",
675
+ "metadata": {},
676
+ "source": [
677
+ "# 3. finally training the model and downloading it as pkl to use in api"
678
+ ]
679
+ },
680
+ {
681
+ "cell_type": "code",
682
+ "execution_count": 142,
683
+ "metadata": {
684
+ "colab": {
685
+ "base_uri": "https://localhost:8080/"
686
+ },
687
+ "id": "EVn2xhzhzQVa",
688
+ "outputId": "6218d7ed-0415-45d3-bb38-73cd009bdf5f"
689
+ },
690
+ "outputs": [
691
+ {
692
+ "name": "stdout",
693
+ "output_type": "stream",
694
+ "text": [
695
+ "Fitting 3 folds for each of 243 candidates, totalling 729 fits\n"
696
+ ]
697
+ },
698
+ {
699
+ "name": "stderr",
700
+ "output_type": "stream",
701
+ "text": [
702
+ "/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [15:01:28] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.\n",
703
+ "\n",
704
+ " E.g. tree_method = \"hist\", device = \"cuda\"\n",
705
+ "\n",
706
+ " warnings.warn(smsg, UserWarning)\n",
707
+ "/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [15:01:28] WARNING: /workspace/src/learner.cc:740: \n",
708
+ "Parameters: { \"predictor\" } are not used.\n",
709
+ "\n",
710
+ " warnings.warn(smsg, UserWarning)\n"
711
+ ]
712
+ },
713
+ {
714
+ "name": "stdout",
715
+ "output_type": "stream",
716
+ "text": [
717
+ "Best parameters found: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.7}\n",
718
+ " param_n_estimators param_learning_rate param_max_depth \\\n",
719
+ "0 100 0.01 3 \n",
720
+ "1 100 0.01 3 \n",
721
+ "2 100 0.01 3 \n",
722
+ "3 200 0.01 3 \n",
723
+ "4 200 0.01 3 \n",
724
+ ".. ... ... ... \n",
725
+ "238 200 0.20 7 \n",
726
+ "239 200 0.20 7 \n",
727
+ "240 300 0.20 7 \n",
728
+ "241 300 0.20 7 \n",
729
+ "242 300 0.20 7 \n",
730
+ "\n",
731
+ " param_subsample param_colsample_bytree mean_test_score \n",
732
+ "0 0.7 0.7 836731.772750 \n",
733
+ "1 0.8 0.7 829332.460518 \n",
734
+ "2 0.9 0.7 829277.959373 \n",
735
+ "3 0.7 0.7 359457.295642 \n",
736
+ "4 0.8 0.7 351281.456970 \n",
737
+ ".. ... ... ... \n",
738
+ "238 0.8 0.9 31653.504011 \n",
739
+ "239 0.9 0.9 31929.875660 \n",
740
+ "240 0.7 0.9 32102.913523 \n",
741
+ "241 0.8 0.9 31576.184742 \n",
742
+ "242 0.9 0.9 31894.048866 \n",
743
+ "\n",
744
+ "[243 rows x 6 columns]\n",
745
+ "Mean Squared Error on the test set: 28483.477323930427\n"
746
+ ]
747
+ },
748
+ {
749
+ "name": "stderr",
750
+ "output_type": "stream",
751
+ "text": [
752
+ "/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [15:01:29] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.\n",
753
+ "\n",
754
+ " E.g. tree_method = \"hist\", device = \"cuda\"\n",
755
+ "\n",
756
+ " warnings.warn(smsg, UserWarning)\n"
757
+ ]
758
+ }
759
+ ],
760
+ "source": [
761
+ "X = data.drop(columns=['WERT'])\n",
762
+ "y = data['WERT']\n",
763
+ "\n",
764
+ "\n",
765
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
766
+ "\n",
767
+ "\n",
768
+ "xgb = XGBRegressor(\n",
769
+ " tree_method='gpu_hist',\n",
770
+ " predictor='gpu_predictor',\n",
771
+ " verbosity=2\n",
772
+ ")\n",
773
+ "\n",
774
+ "\n",
775
+ "param_grid = {\n",
776
+ " 'n_estimators': [100, 200, 300],\n",
777
+ " 'learning_rate': [0.01, 0.1, 0.2],\n",
778
+ " 'max_depth': [3, 5, 7],\n",
779
+ " 'subsample': [0.7, 0.8, 0.9],\n",
780
+ " 'colsample_bytree': [0.7, 0.8, 0.9]\n",
781
+ "}\n",
782
+ "\n",
783
+ "\n",
784
+ "grid_search = GridSearchCV(\n",
785
+ " estimator=xgb,\n",
786
+ " param_grid=param_grid,\n",
787
+ " cv=3,\n",
788
+ " scoring='neg_mean_squared_error',\n",
789
+ " verbose=2,\n",
790
+ " n_jobs=-1\n",
791
+ ")\n",
792
+ "\n",
793
+ "grid_search.fit(X_train, y_train)\n",
794
+ "\n",
795
+ "\n",
796
+ "best_params = grid_search.best_params_\n",
797
+ "print(\"Best parameters found: \", best_params)\n",
798
+ "\n",
799
+ "\n",
800
+ "results = pd.DataFrame(grid_search.cv_results_)\n",
801
+ "\n",
802
+ "\n",
803
+ "results['mean_test_score'] = -results['mean_test_score']\n",
804
+ "\n",
805
+ "\n",
806
+ "print(results[['param_n_estimators', 'param_learning_rate', 'param_max_depth', 'param_subsample', 'param_colsample_bytree', 'mean_test_score']])\n",
807
+ "\n",
808
+ "\n",
809
+ "best_model = grid_search.best_estimator_\n",
810
+ "y_pred = best_model.predict(X_test)\n",
811
+ "\n",
812
+ "mse = mean_squared_error(y_test, y_pred)\n",
813
+ "print(\"Mean Squared Error on the test set: \", mse)\n"
814
+ ]
815
+ },
816
+ {
817
+ "cell_type": "code",
818
+ "execution_count": 165,
819
+ "metadata": {
820
+ "id": "zcb7oWV0HwH5"
821
+ },
822
+ "outputs": [],
823
+ "source": [
824
+ "ex = pd.DataFrame({\n",
825
+ " 'MONATSZAHL': ['Alkoholunfälle'],\n",
826
+ " 'AUSPRAEGUNG': ['Verletzte und Getötete'],\n",
827
+ " 'JAHR': [2012],\n",
828
+ " 'MONAT': ['201207']\n",
829
+ "})\n",
830
+ "ex['MONAT'] = ex['MONAT'].apply(convert_to_month_name)\n",
831
+ "new = transform_new_data(ex, encoder, one_hot_columns)\n"
832
+ ]
833
+ },
834
+ {
835
+ "cell_type": "code",
836
+ "execution_count": null,
837
+ "metadata": {
838
+ "id": "EhSjmlIAOAGk"
839
+ },
840
+ "outputs": [],
841
+ "source": [
842
+ "with open('xgb.pkl', 'wb') as file:\n",
843
+ " pickle.dump(best_model, file)\n",
844
+ "\n",
845
+ "with open('encoder.pkl', 'wb') as file:\n",
846
+ " pickle.dump(encoder, file)"
847
+ ]
848
+ }
849
+ ],
850
+ "metadata": {
851
+ "accelerator": "GPU",
852
+ "colab": {
853
+ "gpuType": "T4",
854
+ "provenance": []
855
+ },
856
+ "kernelspec": {
857
+ "display_name": "Python 3",
858
+ "name": "python3"
859
+ },
860
+ "language_info": {
861
+ "codemirror_mode": {
862
+ "name": "ipython",
863
+ "version": 3
864
+ },
865
+ "file_extension": ".py",
866
+ "mimetype": "text/x-python",
867
+ "name": "python",
868
+ "nbconvert_exporter": "python",
869
+ "pygments_lexer": "ipython3",
870
+ "version": "3.12.6"
871
+ }
872
+ },
873
+ "nbformat": 4,
874
+ "nbformat_minor": 0
875
+ }
main.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ from fastapi import FastAPI, HTTPException
3
+ from pydantic import BaseModel, ValidationError
4
+ import pandas as pd
5
+ from helper import convert_to_month_name, transform_new_data
6
+
7
+ # Load the model and encoder
8
+ model = joblib.load('model.pkl')
9
+ encoder = joblib.load('encoder.pkl')
10
+
11
+ app = FastAPI()
12
+
13
+ # Pydantic model for input data validation
14
class Item(BaseModel):
    # Request body of the prediction endpoint. The field names equal the
    # column names handed to the encoder, so they must not be renamed.
    MONATSZAHL: str   # category column, passed to the encoder as-is
    AUSPRAEGUNG: str  # category column, passed to the encoder as-is
    JAHR: int         # year; also passed to the encoder as a category
    MONAT: str        # 'YYYYMM' (e.g. '201207') or 'Summe'; converted to a month name
19
+
20
+ # Endpoint for inference
21
@app.post("/predict/")
async def predict(item: Item):
    """Run the model on one record and return its prediction.

    The request is turned into a one-row DataFrame, ``MONAT`` is converted to
    a month name, the columns are encoded with the loaded encoder and the
    result is passed to ``model.predict``.

    Returns a dict ``{"prediction": [...]}``.

    Raises HTTPException with status 400 when ``MONAT`` cannot be converted,
    and status 500 when encoding or prediction fails.
    """
    try:
        # Construct input data from request
        input_data = {
            "MONATSZAHL": item.MONATSZAHL,
            "AUSPRAEGUNG": item.AUSPRAEGUNG,
            "JAHR": item.JAHR,
            "MONAT": item.MONAT
        }

        # Convert input data to DataFrame
        input_df = pd.DataFrame([input_data])

        # Convert 'MONAT' to month name
        try:
            input_df['MONAT'] = input_df['MONAT'].apply(convert_to_month_name)
        except Exception as e:
            raise HTTPException(
                status_code=400,
                detail=f"Error converting 'MONAT' to month name: {e}"
            ) from e

        # Transform data with encoder
        # NOTE(review): an unseen category presumably makes the encoder raise
        # here and is reported as 500; consider mapping that case to 4xx.
        try:
            transformed_df = transform_new_data(
                input_df,
                encoder,
                original_one_hot_columns=['MONATSZAHL', 'AUSPRAEGUNG', "JAHR", 'MONAT']
            )
        except Exception as e:
            raise HTTPException(
                status_code=500,
                detail=f"Error transforming data: {e}"
            ) from e

        # Run the model on the encoded row
        try:
            prediction = model.predict(transformed_df)
        except Exception as e:
            raise HTTPException(
                status_code=500,
                detail=f"Error during model prediction: {e}"
            ) from e

        # Return prediction result
        return {"prediction": prediction.tolist()}

    except HTTPException:
        # HTTPException is itself an Exception subclass. Without this clause
        # the generic handler below would swallow the responses raised above
        # and turn e.g. the 400 for a bad MONAT into a 500.
        raise
    except ValidationError as e:
        # NOTE(review): the request body is validated before this handler is
        # called, so this branch is presumably never reached; kept for
        # backward compatibility.
        raise HTTPException(
            status_code=422,
            detail=f"Validation error: {e}"
        ) from e
    except KeyError as e:
        raise HTTPException(
            status_code=400,
            detail=f"Missing expected column: {e}"
        ) from e
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Internal server error: {e}"
        ) from e
model.pkl ADDED
Binary file (712 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ pydantic
3
+ pandas
4
+ scikit-learn
5
+ joblib
6
+ uvicorn
7
+ xgboost