Spaces:
Sleeping
Sleeping
{ | |
"cells": [ | |
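{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"id": 1, | |
"source": [ | |
"# Data Science Analysis Notebook\n", | |
"\n", | |
"This notebook contains example Python code for a data analysis workflow: it loads `housing_data.csv`, fills in missing values, explores correlations between the numeric features, engineers ratio features, and fits a linear regression model to predict `median_house_value`." | |
] | |
}, | |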
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"id": 2, | |
"source": [ | |
"# Import libraries\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | |
"import seaborn as sns\n", | |
"from sklearn.linear_model import LinearRegression\n", | |
"from sklearn.metrics import mean_squared_error, r2_score\n", | |
"from sklearn.model_selection import train_test_split\n", | |
"from sklearn.preprocessing import StandardScaler\n", | |
"\n", | |
"# Set visualization style\n", | |
"sns.set(style='whitegrid')\n", | |
"%matplotlib inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"id": 3, | |
"source": [ | |
"# Read the housing data from disk into a DataFrame\n", | |
"df = pd.read_csv(filepath_or_buffer='housing_data.csv')\n", | |
"\n", | |
"# Report the dimensions, then preview the first rows\n", | |
"print(f\"Dataset shape: {df.shape}\")\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"id": 4, | |
"source": [ | |
"# Perform data cleaning\n", | |
"# Fill missing values: median for numeric columns, most frequent value otherwise.\n", | |
"# The filled Series is assigned back to df[column]; calling fillna(inplace=True)\n", | |
"# on df[column] acts on an intermediate object and may leave df unchanged.\n", | |
"numeric_cols = df.select_dtypes(include='number').columns\n", | |
"\n", | |
"for column in df.columns:\n", | |
"    if column in numeric_cols:\n", | |
"        fill_value = df[column].median()\n", | |
"    else:\n", | |
"        most_frequent = df[column].mode()\n", | |
"        if most_frequent.empty:\n", | |
"            # Every value is missing, so there is nothing to impute with.\n", | |
"            continue\n", | |
"        fill_value = most_frequent.iloc[0]\n", | |
"    df[column] = df[column].fillna(fill_value)\n", | |
"\n", | |
"# Check for remaining missing values\n", | |
"print(\"Missing values after cleaning:\")\n", | |
"print(df.isnull().sum())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"id": 5, | |
"source": [ | |
"# Exploratory data analysis\n", | |
"# Create correlation matrix from every numeric column (any integer or float\n", | |
"# width, not only float64/int64)\n", | |
"numeric_columns = df.select_dtypes(include='number').columns\n", | |
"correlation_matrix = df[numeric_columns].corr()\n", | |
"\n", | |
"# Plot heatmap on an explicit Axes so the figure does not depend on pyplot's\n", | |
"# implicit current-figure state\n", | |
"fig, ax = plt.subplots(figsize=(12, 10))\n", | |
"sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)\n", | |
"ax.set_title('Correlation Matrix of Numeric Features', fontsize=18)\n", | |
"plt.setp(ax.get_xticklabels(), rotation=45, ha='right')\n", | |
"fig.tight_layout()\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"id": 6, | |
"source": [ | |
"# Feature engineering\n", | |
"# NOTE: this cell modifies `df` in place. Re-run the loading and cleaning cells\n", | |
"# before re-running it; otherwise the ratios below would be computed from\n", | |
"# columns that have already been scaled.\n", | |
"\n", | |
"def safe_ratio(numerator, denominator):\n", | |
"    \"\"\"Return numerator / denominator, using the median ratio where the denominator is 0.\"\"\"\n", | |
"    # Dividing by 0 would produce inf, which the regression model cannot accept.\n", | |
"    ratio = numerator / denominator.replace(0, np.nan)\n", | |
"    return ratio.fillna(ratio.median())\n", | |
"\n", | |
"# Create new features\n", | |
"if 'bedrooms' in df.columns and 'total_rooms' in df.columns:\n", | |
"    df['bedrooms_ratio'] = safe_ratio(df['bedrooms'], df['total_rooms'])\n", | |
"\n", | |
"if 'total_rooms' in df.columns and 'households' in df.columns:\n", | |
"    df['rooms_per_household'] = safe_ratio(df['total_rooms'], df['households'])\n", | |
"\n", | |
"# Scale numeric features. The column list is built here, after the new features\n", | |
"# exist, so they are scaled as well. The prediction target is left in its\n", | |
"# original units so the reported error stays interpretable.\n", | |
"feature_columns = (\n", | |
"    df.select_dtypes(include='number')\n", | |
"    .columns\n", | |
"    .drop('median_house_value', errors='ignore')\n", | |
")\n", | |
"scaler = StandardScaler()\n", | |
"df[feature_columns] = scaler.fit_transform(df[feature_columns])\n", | |
"\n", | |
"# Display transformed data\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"id": 7, | |
"source": [ | |
"# Build a simple prediction model\n", | |
"# Assume we're predicting median_house_value\n", | |
"TARGET = 'median_house_value'\n", | |
"\n", | |
"if TARGET in df.columns:\n", | |
"    # Prepare features and target. Non-numeric columns are one-hot encoded\n", | |
"    # because LinearRegression only accepts numeric input; numeric columns\n", | |
"    # pass through unchanged.\n", | |
"    X = pd.get_dummies(df.drop(columns=TARGET), drop_first=True, dtype=float)\n", | |
"    y = df[TARGET]\n", | |
"\n", | |
"    # Split the data\n", | |
"    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", | |
"\n", | |
"    # Train the model\n", | |
"    model = LinearRegression()\n", | |
"    model.fit(X_train, y_train)\n", | |
"\n", | |
"    # Make predictions\n", | |
"    y_pred = model.predict(X_test)\n", | |
"\n", | |
"    # Evaluate the model\n", | |
"    mse = mean_squared_error(y_test, y_pred)\n", | |
"    r2 = r2_score(y_test, y_pred)\n", | |
"\n", | |
"    print(f\"Mean Squared Error: {mse:.2f}\")\n", | |
"    print(f\"R² Score: {r2:.2f}\")\n", | |
"\n", | |
"    # Plot actual vs predicted values; the dashed diagonal marks perfect predictions\n", | |
"    fig, ax = plt.subplots(figsize=(10, 6))\n", | |
"    ax.scatter(y_test, y_pred, alpha=0.5)\n", | |
"    ax.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')\n", | |
"    ax.set_xlabel('Actual Values')\n", | |
"    ax.set_ylabel('Predicted Values')\n", | |
"    ax.set_title('Actual vs Predicted Values')\n", | |
"    fig.tight_layout()\n", | |
"    plt.show()\n", | |
"else:\n", | |
"    # Say so explicitly instead of finishing the cell with no output at all.\n", | |
"    print(f\"Column '{TARGET}' not found; skipping model training.\")" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} | |