Spaces:

irmchek
/

mynotebooksummary

Sleeping

App Files Files Community

mynotebooksummary / my_notebook.json

irmchek

prototype notebook summarizer

462fea8 5 months ago

raw

history blame contribute delete

4.93 kB

	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"id": 1,
	"source": [
	"# Data Science Analysis Notebook\n",
	"\n",
	"This notebook contains some example Python code for data analysis."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"id": 2,
	"outputs": [],
	"source": [
	"# Import libraries\n",
	"import pandas as pd\n",
	"import numpy as np\n",
	"import matplotlib.pyplot as plt\n",
	"import seaborn as sns\n",
	"\n",
	"# Set visualization style and render figures inline in the notebook\n",
	"sns.set(style='whitegrid')\n",
	"%matplotlib inline"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"id": 3,
	"outputs": [],
	"source": [
	"# Load the dataset (path is relative to the notebook's working directory)\n",
	"df = pd.read_csv('housing_data.csv')\n",
	"\n",
	"# Display basic information: (rows, columns), then the first rows\n",
	"print(f\"Dataset shape: {df.shape}\")\n",
	"df.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"id": 4,
	"outputs": [],
	"source": [
	"# Perform data cleaning\n",
	"# Numeric columns: fill missing values with the column median.\n",
	"# All other columns: fill missing values with the most frequent value (mode).\n",
	"for column in df.columns:\n",
	"    if df[column].dtype in ['float64', 'int64']:\n",
	"        # Assign the result back instead of using chained inplace=True, which\n",
	"        # is deprecated in pandas 2.x and has no effect under Copy-on-Write.\n",
	"        df[column] = df[column].fillna(df[column].median())\n",
	"    else:\n",
	"        modes = df[column].mode()\n",
	"        # mode() is empty when every value is missing; leave such a column as-is\n",
	"        if not modes.empty:\n",
	"            df[column] = df[column].fillna(modes.iloc[0])\n",
	"\n",
	"# Check for remaining missing values\n",
	"print(\"Missing values after cleaning:\")\n",
	"print(df.isnull().sum())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"id": 5,
	"outputs": [],
	"source": [
	"# Exploratory data analysis\n",
	"# Create correlation matrix over the float64/int64 columns only\n",
	"numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns\n",
	"correlation_matrix = df[numeric_columns].corr()\n",
	"\n",
	"# Plot heatmap with the coefficient printed in each cell\n",
	"plt.figure(figsize=(12, 10))\n",
	"sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)\n",
	"plt.title('Correlation Matrix of Numeric Features', fontsize=18)\n",
	"plt.xticks(rotation=45, ha='right')\n",
	"plt.tight_layout()\n",
	"plt.show()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"id": 6,
	"outputs": [],
	"source": [
	"# Feature engineering\n",
	"# Create ratio features; a zero denominator becomes NaN instead of inf\n",
	"if 'bedrooms' in df.columns and 'total_rooms' in df.columns:\n",
	"    df['bedrooms_ratio'] = df['bedrooms'] / df['total_rooms'].replace(0, np.nan)\n",
	"\n",
	"if 'total_rooms' in df.columns and 'households' in df.columns:\n",
	"    df['rooms_per_household'] = df['total_rooms'] / df['households'].replace(0, np.nan)\n",
	"\n",
	"# Fill undefined ratios with the column median so later steps see no NaN\n",
	"for ratio_column in ['bedrooms_ratio', 'rooms_per_household']:\n",
	"    if ratio_column in df.columns:\n",
	"        df[ratio_column] = df[ratio_column].fillna(df[ratio_column].median())\n",
	"\n",
	"# Scale numeric features\n",
	"# Columns are selected here (not reused from an earlier cell) so the engineered\n",
	"# ratios are scaled too; the prediction target keeps its original units.\n",
	"# NOTE(review): the scaler is fit on the full dataset, before any train/test split.\n",
	"from sklearn.preprocessing import StandardScaler\n",
	"feature_columns = df.select_dtypes(include=['float64', 'int64']).columns\n",
	"feature_columns = feature_columns.drop('median_house_value', errors='ignore')\n",
	"scaler = StandardScaler()\n",
	"df[feature_columns] = scaler.fit_transform(df[feature_columns])\n",
	"\n",
	"# Display transformed data\n",
	"df.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"id": 7,
	"outputs": [],
	"source": [
	"# Build a simple prediction model\n",
	"from sklearn.model_selection import train_test_split\n",
	"from sklearn.linear_model import LinearRegression\n",
	"from sklearn.metrics import mean_squared_error, r2_score\n",
	"\n",
	"# Assume we're predicting median_house_value\n",
	"if 'median_house_value' in df.columns:\n",
	"    # Prepare features and target.\n",
	"    # LinearRegression cannot fit string/object columns, so only numeric\n",
	"    # features are kept; non-numeric columns are dropped from the model input.\n",
	"    X = df.drop('median_house_value', axis=1).select_dtypes(include='number')\n",
	"    y = df['median_house_value']\n",
	"\n",
	"    # Split the data (fixed random_state keeps the split reproducible)\n",
	"    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
	"\n",
	"    # Train the model\n",
	"    model = LinearRegression()\n",
	"    model.fit(X_train, y_train)\n",
	"\n",
	"    # Make predictions\n",
	"    y_pred = model.predict(X_test)\n",
	"\n",
	"    # Evaluate the model on the held-out 20%\n",
	"    mse = mean_squared_error(y_test, y_pred)\n",
	"    r2 = r2_score(y_test, y_pred)\n",
	"\n",
	"    print(f\"Mean Squared Error: {mse:.2f}\")\n",
	"    print(f\"R² Score: {r2:.2f}\")\n",
	"\n",
	"    # Plot actual vs predicted values; the dashed red line marks perfect predictions\n",
	"    plt.figure(figsize=(10, 6))\n",
	"    plt.scatter(y_test, y_pred, alpha=0.5)\n",
	"    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')\n",
	"    plt.xlabel('Actual Values')\n",
	"    plt.ylabel('Predicted Values')\n",
	"    plt.title('Actual vs Predicted Values')\n",
	"    plt.tight_layout()\n",
	"    plt.show()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}