mynotebooksummary / my_notebook.json
irmchek's picture
prototype notebook summarizer
462fea8
{
"cells": [
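{
"cell_type": "markdown",
"metadata": {},
"id": 1,
"source": [
"# Data Science Analysis Notebook\n",
"\n",
"This notebook walks through an example analysis of a housing dataset in Python: loading the data, filling missing values, exploring correlations, engineering features, and fitting a baseline linear regression model."
]
},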
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"id": 2,
"source": [
"# Import libraries\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Set visualization style\n",
"sns.set(style='whitegrid')\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"id": 3,
"source": [
"# Read the housing CSV (path is relative to the working directory) into `df`\n",
"df = pd.read_csv('housing_data.csv')\n",
"\n",
"# Print the (rows, columns) shape, then show the first rows as the cell output\n",
"print(f\"Dataset shape: {df.shape}\")\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"id": 4,
"source": [
"# Perform data cleaning\n",
"# Numeric columns are filled with their median; all other columns are\n",
"# filled with their most frequent value (mode).\n",
"for column in df.columns:\n",
"    values = df[column]\n",
"    if not values.isnull().any():\n",
"        continue  # nothing to fill in this column\n",
"\n",
"    # Any numeric dtype counts (not only float64/int64); booleans are\n",
"    # excluded because a median is not meaningful for them.\n",
"    is_numeric = (pd.api.types.is_numeric_dtype(values)\n",
"                  and not pd.api.types.is_bool_dtype(values))\n",
"    if is_numeric:\n",
"        fill_value = values.median()\n",
"    else:\n",
"        modes = values.mode()\n",
"        if modes.empty:\n",
"            continue  # column is entirely missing, so there is no mode to fill with\n",
"        fill_value = modes.iloc[0]\n",
"\n",
"    # Assign the result back to the frame: calling fillna(inplace=True) on\n",
"    # df[column] is chained assignment, which pandas 2.x deprecates and which\n",
"    # does not update df under Copy-on-Write.\n",
"    df[column] = values.fillna(fill_value)\n",
"\n",
"# Check for remaining missing values\n",
"print(\"Missing values after cleaning:\")\n",
"print(df.isnull().sum())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"id": 5,
"source": [
"# Exploratory data analysis\n",
"# Correlations are computed over the float64/int64 columns only\n",
"numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns\n",
"correlation_matrix = df.loc[:, numeric_columns].corr()\n",
"\n",
"# Annotated heatmap of the correlation matrix, drawn on an explicit Axes\n",
"fig, ax = plt.subplots(figsize=(12, 10))\n",
"sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)\n",
"ax.set_title('Correlation Matrix of Numeric Features', fontsize=18)\n",
"# Tilt the column labels so long feature names do not overlap\n",
"plt.setp(ax.get_xticklabels(), rotation=45, ha='right')\n",
"fig.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"id": 6,
"source": [
"# Feature engineering\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"\n",
"def _safe_ratio(numerator, denominator):\n",
"    \"\"\"Return numerator / denominator element-wise.\n",
"\n",
"    Zero denominators are treated as missing, and missing ratios are filled\n",
"    with the median ratio, so a zero denominator never produces inf.\n",
"    \"\"\"\n",
"    ratio = numerator / denominator.replace(0, np.nan)\n",
"    return ratio.fillna(ratio.median())\n",
"\n",
"\n",
"# Create new features\n",
"if 'bedrooms' in df.columns and 'total_rooms' in df.columns:\n",
"    df['bedrooms_ratio'] = _safe_ratio(df['bedrooms'], df['total_rooms'])\n",
"\n",
"if 'total_rooms' in df.columns and 'households' in df.columns:\n",
"    df['rooms_per_household'] = _safe_ratio(df['total_rooms'], df['households'])\n",
"\n",
"# Scale numeric features\n",
"# The numeric columns are re-selected here so the ratio features created\n",
"# above are scaled too. The target column is left in its original units so\n",
"# that error metrics computed on it remain interpretable.\n",
"columns_to_scale = (\n",
"    df.select_dtypes(include=['float64', 'int64'])\n",
"    .columns\n",
"    .drop('median_house_value', errors='ignore')\n",
")\n",
"scaler = StandardScaler()\n",
"if len(columns_to_scale) > 0:\n",
"    df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])\n",
"\n",
"# Display transformed data\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"id": 7,
"source": [
"# Build a simple prediction model\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"\n",
"# Assume we're predicting median_house_value\n",
"if 'median_house_value' in df.columns:\n",
"    # Prepare features and target\n",
"    # LinearRegression only accepts numeric input, so text/categorical\n",
"    # columns are one-hot encoded; the first level of each is dropped to\n",
"    # avoid perfectly collinear indicator columns.\n",
"    X = pd.get_dummies(df.drop(columns='median_house_value'), drop_first=True, dtype=float)\n",
"    y = df['median_house_value']\n",
"\n",
"    # Split the data (fixed random_state keeps the split reproducible)\n",
"    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"    # Train the model\n",
"    model = LinearRegression()\n",
"    model.fit(X_train, y_train)\n",
"\n",
"    # Make predictions\n",
"    y_pred = model.predict(X_test)\n",
"\n",
"    # Evaluate the model\n",
"    mse = mean_squared_error(y_test, y_pred)\n",
"    r2 = r2_score(y_test, y_pred)\n",
"\n",
"    print(f\"Mean Squared Error: {mse:.2f}\")\n",
"    print(f\"R² Score: {r2:.2f}\")\n",
"\n",
"    # Plot actual vs predicted values; the dashed red line marks perfect predictions\n",
"    plt.figure(figsize=(10, 6))\n",
"    plt.scatter(y_test, y_pred, alpha=0.5)\n",
"    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')\n",
"    plt.xlabel('Actual Values')\n",
"    plt.ylabel('Predicted Values')\n",
"    plt.title('Actual vs Predicted Values')\n",
"    plt.tight_layout()\n",
"    plt.show()\n",
"else:\n",
"    # Say so explicitly rather than leaving the cell silently empty\n",
"    print(\"Column 'median_house_value' not found; skipping model training.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}