Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
313 changes: 313 additions & 0 deletions lab_DataStructure_combining.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,313 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "43a0c7ff-ef0f-460f-9594-d4e5f25e5cce",
"metadata": {},
"source": [
"## Challenge 1: Combining & Cleaning Data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c5a20973-0455-441e-a076-d98f8d3fd2e3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Customer ST GENDER Education Customer Lifetime Value \\\n",
"0 RB50392 Washington NaN Master NaN \n",
"1 QZ44356 Arizona F Bachelor 697953.59% \n",
"2 AI49188 Nevada F Bachelor 1288743.17% \n",
"3 WW63253 California M Bachelor 764586.18% \n",
"4 GA49547 Washington M High School or Below 536307.65% \n",
"\n",
" Income Monthly Premium Auto Number of Open Complaints Policy Type \\\n",
"0 0.0 1000.0 1/0/00 Personal Auto \n",
"1 0.0 94.0 1/0/00 Personal Auto \n",
"2 48767.0 108.0 1/0/00 Personal Auto \n",
"3 0.0 106.0 1/0/00 Corporate Auto \n",
"4 36357.0 68.0 1/0/00 Personal Auto \n",
"\n",
" Vehicle Class Total Claim Amount \n",
"0 Four-Door Car 2.704934 \n",
"1 Four-Door Car 1131.464935 \n",
"2 Two-Door Car 566.472247 \n",
"3 SUV 529.881344 \n",
"4 Four-Door Car 17.269323 Customer ST GENDER Education Customer Lifetime Value Income \\\n",
"0 GS98873 Arizona F Bachelor 323912.47% 16061 \n",
"1 CW49887 California F Master 462680.11% 79487 \n",
"2 MY31220 California F College 899704.02% 54230 \n",
"3 UH35128 Oregon F College 2580706.30% 71210 \n",
"4 WH52799 Arizona F College 380812.21% 94903 \n",
"\n",
" Monthly Premium Auto Number of Open Complaints Total Claim Amount \\\n",
"0 88 1/0/00 633.6 \n",
"1 114 1/0/00 547.2 \n",
"2 112 1/0/00 537.6 \n",
"3 214 1/1/00 1027.2 \n",
"4 94 1/0/00 451.2 \n",
"\n",
" Policy Type Vehicle Class \n",
"0 Personal Auto Four-Door Car \n",
"1 Special Auto SUV \n",
"2 Personal Auto Two-Door Car \n",
"3 Personal Auto Luxury Car \n",
"4 Corporate Auto Two-Door Car Customer State Customer Lifetime Value Education Gender \\\n",
"0 SA25987 Washington 3479.137523 High School or Below M \n",
"1 TB86706 Arizona 2502.637401 Master M \n",
"2 ZL73902 Nevada 3265.156348 Bachelor F \n",
"3 KX23516 California 4455.843406 High School or Below F \n",
"4 FN77294 California 7704.958480 High School or Below M \n",
"\n",
" Income Monthly Premium Auto Number of Open Complaints Policy Type \\\n",
"0 0 104 0 Personal Auto \n",
"1 0 66 0 Personal Auto \n",
"2 25820 82 0 Personal Auto \n",
"3 0 121 0 Personal Auto \n",
"4 30366 101 2 Personal Auto \n",
"\n",
" Total Claim Amount Vehicle Class \n",
"0 499.200000 Two-Door Car \n",
"1 3.468912 Two-Door Car \n",
"2 393.600000 Four-Door Car \n",
"3 699.615192 SUV \n",
"4 484.800000 SUV \n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Source CSVs (three extracts of the same customer data)\n",
"file1_url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\"\n",
"file2_url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\"\n",
"file3_url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\"\n",
"\n",
"# Read CSVs\n",
"df1 = pd.read_csv(file1_url)\n",
"df2 = pd.read_csv(file2_url)\n",
"df3 = pd.read_csv(file3_url)\n",
"\n",
"# Quick preview. Each frame is rendered on its own: handing all three to a\n",
"# single print() joins them with spaces, so one table's header ends up on the\n",
"# previous table's last row.\n",
"display(df1.head(), df2.head(), df3.head())\n"
]
},
{
"cell_type": "markdown",
"id": "09e0f4cd-2c6c-4bfe-98a5-b4880f68a197",
"metadata": {},
"source": [
"## Step 1: Clean the datasets"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5441cb5a-5822-4db9-8326-f0f5d97916d8",
"metadata": {},
"outputs": [],
"source": [
"def clean_df(df):\n",
"    \"\"\"Return a cleaned copy of ``df``; the input frame is left untouched.\n",
"\n",
"    Steps, in order:\n",
"      1. Normalise column labels: strip whitespace, lowercase, spaces -> underscores.\n",
"      2. Drop fully duplicated rows.\n",
"      3. Fill missing values: 0 in numeric columns, 'Unknown' in object columns.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Raw frame to clean.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        A new frame with normalised labels, no duplicate rows and no NaN in\n",
"        numeric/object columns.\n",
"    \"\"\"\n",
"    # Work on a copy so the caller's frame keeps its original column labels.\n",
"    cleaned = df.copy()\n",
"    # str() keeps this working for non-string labels (e.g. an empty frame's RangeIndex).\n",
"    cleaned.columns = [\n",
"        str(label).strip().lower().replace(' ', '_') for label in cleaned.columns\n",
"    ]\n",
"    cleaned = cleaned.drop_duplicates()\n",
"\n",
"    # Build one column -> fill value mapping and apply it in a single frame-level\n",
"    # fillna; assigning column by column into the drop_duplicates() result is\n",
"    # what triggered SettingWithCopyWarning.\n",
"    fill_values = {col: 0 for col in cleaned.select_dtypes(include='number').columns}\n",
"    fill_values.update(\n",
"        {col: 'Unknown' for col in cleaned.select_dtypes(include='object').columns}\n",
"    )\n",
"    if fill_values:\n",
"        cleaned = cleaned.fillna(value=fill_values)\n",
"    return cleaned\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a1654609-096d-4daf-91cc-a16c81b961c4",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\kyila\\AppData\\Local\\Temp\\ipykernel_59716\\3053997771.py:8: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df[col] = df[col].fillna(0)\n",
"C:\\Users\\kyila\\AppData\\Local\\Temp\\ipykernel_59716\\3053997771.py:10: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df[col] = df[col].fillna('Unknown')\n"
]
}
],
"source": [
"# Run the shared cleaning routine over each raw frame, in file order\n",
"df1_clean, df2_clean, df3_clean = (clean_df(raw) for raw in (df1, df2, df3))\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "29fc4ab1-6e7f-4261-bd86-f9a3c98e9b83",
"metadata": {},
"outputs": [],
"source": [
"#Combine datasets\n",
"# file1/file2 label the state column 'ST' while file3 uses 'State'; after\n",
"# lowercasing these are 'st' and 'state'. Align them first so the concat yields\n",
"# one 'state' column instead of two half-empty ones (which would bring back NaNs).\n",
"combined_df = pd.concat(\n",
"    [\n",
"        frame.rename(columns={'st': 'state'})\n",
"        for frame in (df1_clean, df2_clean, df3_clean)\n",
"    ],\n",
"    ignore_index=True,\n",
")\n"
]
},
{
"cell_type": "markdown",
"id": "6d01613a-1438-473b-92e9-576b446f4350",
"metadata": {},
"source": [
"## Challenge 2: Structuring Data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1b563ce0-6a7f-4d45-a275-3d0567fd5f36",
"metadata": {},
"outputs": [],
"source": [
"marketing_url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\"\n",
"\n",
"# Load the marketing dataset, then normalise its headers to snake_case:\n",
"# trim surrounding whitespace, lowercase, and turn spaces into underscores.\n",
"marketing_df = pd.read_csv(marketing_url)\n",
"marketing_df.columns = (\n",
"    marketing_df.columns\n",
"    .str.strip()\n",
"    .str.lower()\n",
"    .str.replace(' ', '_')\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d39a1bb1-0f59-44a1-ba96-bf1dd8f3d240",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" total_claim_amount\n",
"sales_channel \n",
"Agent 1810226.82\n",
"Branch 1301204.00\n",
"Call Center 926600.82\n",
"Web 706600.04\n"
]
}
],
"source": [
"# Total revenue per sales channel.\n",
"# total_claim_amount stands in for revenue here; one output row per sales_channel,\n",
"# summed and rounded to two decimals.\n",
"revenue_by_channel = (\n",
"    marketing_df\n",
"    .pivot_table(index='sales_channel', values='total_claim_amount', aggfunc='sum')\n",
"    .round(2)\n",
")\n",
"\n",
"print(revenue_by_channel)\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "96aa97ca-88ae-40fb-afe0-34f3169b8096",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"education Bachelor College Doctor High School or Below Master\n",
"gender \n",
"F 7874.27 7748.82 7328.51 8675.22 8157.05\n",
"M 7703.60 8052.46 7415.33 8149.69 8168.83\n"
]
}
],
"source": [
"# Mean customer lifetime value, cross-tabulated:\n",
"# rows are gender, columns are education level, values rounded to two decimals.\n",
"avg_clv = (\n",
"    marketing_df\n",
"    .pivot_table(\n",
"        index='gender',\n",
"        columns='education',\n",
"        values='customer_lifetime_value',\n",
"        aggfunc='mean',\n",
"    )\n",
"    .round(2)\n",
")\n",
"\n",
"print(avg_clv)\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "e4b26496-6d92-43ec-9a68-b84361f0e9aa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" policy_type month num_complaints\n",
"0 Corporate Auto 1 1252\n",
"1 Corporate Auto 2 1089\n",
"2 Personal Auto 1 4329\n",
"3 Personal Auto 2 3799\n",
"4 Special Auto 1 237\n",
"5 Special Auto 2 204\n"
]
}
],
"source": [
"#Number of complaints by policy type and month\n",
"# Sum the open complaints within each (policy_type, month) group.\n",
"# groupby(...).size() only counts customer rows, so it reports how many\n",
"# customers fall in each group rather than how many complaints they filed.\n",
"# NOTE(review): assumes the complaints column is named 'number_of_open_complaints'\n",
"# after header normalisation - confirm against marketing_df.columns.\n",
"complaints_summary = (\n",
"    marketing_df\n",
"    .groupby(['policy_type', 'month'])['number_of_open_complaints']\n",
"    .sum()\n",
"    .reset_index(name='num_complaints')\n",
")\n",
"\n",
"# This is already in long format:\n",
"print(complaints_summary)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "32a8d2f9-a466-46db-98b4-1e5035c41130",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}