data-bootcamp-v4 · Yilak-maker · Oct 5, 2025
diff --git a/lab_DataStructure_combining.ipynb b/lab_DataStructure_combining.ipynb
@@ -0,0 +1,313 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "43a0c7ff-ef0f-460f-9594-d4e5f25e5cce",
+   "metadata": {},
+   "source": [
+    "## Challenge 1: Combining & Cleaning Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "c5a20973-0455-441e-a076-d98f8d3fd2e3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Customer          ST GENDER             Education Customer Lifetime Value  \\\n",
+      "0  RB50392  Washington    NaN                Master                     NaN   \n",
+      "1  QZ44356     Arizona      F              Bachelor              697953.59%   \n",
+      "2  AI49188      Nevada      F              Bachelor             1288743.17%   \n",
+      "3  WW63253  California      M              Bachelor              764586.18%   \n",
+      "4  GA49547  Washington      M  High School or Below              536307.65%   \n",
+      "\n",
+      "    Income  Monthly Premium Auto Number of Open Complaints     Policy Type  \\\n",
+      "0      0.0                1000.0                    1/0/00   Personal Auto   \n",
+      "1      0.0                  94.0                    1/0/00   Personal Auto   \n",
+      "2  48767.0                 108.0                    1/0/00   Personal Auto   \n",
+      "3      0.0                 106.0                    1/0/00  Corporate Auto   \n",
+      "4  36357.0                  68.0                    1/0/00   Personal Auto   \n",
+      "\n",
+      "   Vehicle Class  Total Claim Amount  \n",
+      "0  Four-Door Car            2.704934  \n",
+      "1  Four-Door Car         1131.464935  \n",
+      "2   Two-Door Car          566.472247  \n",
+      "3            SUV          529.881344  \n",
+      "4  Four-Door Car           17.269323     Customer          ST GENDER Education Customer Lifetime Value  Income  \\\n",
+      "0  GS98873     Arizona      F  Bachelor              323912.47%   16061   \n",
+      "1  CW49887  California      F    Master              462680.11%   79487   \n",
+      "2  MY31220  California      F   College              899704.02%   54230   \n",
+      "3  UH35128      Oregon      F   College             2580706.30%   71210   \n",
+      "4  WH52799     Arizona      F   College              380812.21%   94903   \n",
+      "\n",
+      "   Monthly Premium Auto Number of Open Complaints  Total Claim Amount  \\\n",
+      "0                    88                    1/0/00               633.6   \n",
+      "1                   114                    1/0/00               547.2   \n",
+      "2                   112                    1/0/00               537.6   \n",
+      "3                   214                    1/1/00              1027.2   \n",
+      "4                    94                    1/0/00               451.2   \n",
+      "\n",
+      "      Policy Type  Vehicle Class  \n",
+      "0   Personal Auto  Four-Door Car  \n",
+      "1    Special Auto            SUV  \n",
+      "2   Personal Auto   Two-Door Car  \n",
+      "3   Personal Auto     Luxury Car  \n",
+      "4  Corporate Auto   Two-Door Car     Customer       State  Customer Lifetime Value             Education Gender  \\\n",
+      "0  SA25987  Washington              3479.137523  High School or Below      M   \n",
+      "1  TB86706     Arizona              2502.637401                Master      M   \n",
+      "2  ZL73902      Nevada              3265.156348              Bachelor      F   \n",
+      "3  KX23516  California              4455.843406  High School or Below      F   \n",
+      "4  FN77294  California              7704.958480  High School or Below      M   \n",
+      "\n",
+      "   Income  Monthly Premium Auto  Number of Open Complaints    Policy Type  \\\n",
+      "0       0                   104                          0  Personal Auto   \n",
+      "1       0                    66                          0  Personal Auto   \n",
+      "2   25820                    82                          0  Personal Auto   \n",
+      "3       0                   121                          0  Personal Auto   \n",
+      "4   30366                   101                          2  Personal Auto   \n",
+      "\n",
+      "   Total Claim Amount  Vehicle Class  \n",
+      "0          499.200000   Two-Door Car  \n",
+      "1            3.468912   Two-Door Car  \n",
+      "2          393.600000  Four-Door Car  \n",
+      "3          699.615192            SUV  \n",
+      "4          484.800000            SUV  \n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Source CSVs: raw GitHub URLs in the data-bootcamp-v4/data repository\n",
+    "file1_url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\"\n",
+    "file2_url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\"\n",
+    "file3_url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\"\n",
+    "\n",
+    "# Load each file into its own DataFrame (needs network access)\n",
+    "df1 = pd.read_csv(file1_url)\n",
+    "df2 = pd.read_csv(file2_url)\n",
+    "df3 = pd.read_csv(file3_url)\n",
+    "\n",
+    "# Preview the first rows of every frame. The blank-line separator keeps the\n",
+    "# last row of one frame from running into the header of the next one.\n",
+    "print(df1.head(), df2.head(), df3.head(), sep='\\n\\n')\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "09e0f4cd-2c6c-4bfe-98a5-b4880f68a197",
+   "metadata": {},
+   "source": [
+    "## Step 1: Clean the datasets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "5441cb5a-5822-4db9-8326-f0f5d97916d8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clean_df(df):\n",
+    "    # Strip column names\n",
+    "    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')\n",
+    "    # Remove duplicates\n",
+    "    df = df.drop_duplicates()\n",
+    "    # Fill missing values (example: fill numeric NaN with 0, string NaN with 'Unknown')\n",
+    "    for col in df.select_dtypes(include='number').columns:\n",
+    "        df[col] = df[col].fillna(0)\n",
+    "    for col in df.select_dtypes(include='object').columns:\n",
+    "        df[col] = df[col].fillna('Unknown')\n",
+    "    return df\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "a1654609-096d-4daf-91cc-a16c81b961c4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\kyila\\AppData\\Local\\Temp\\ipykernel_59716\\3053997771.py:8: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  df[col] = df[col].fillna(0)\n",
+      "C:\\Users\\kyila\\AppData\\Local\\Temp\\ipykernel_59716\\3053997771.py:10: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  df[col] = df[col].fillna('Unknown')\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Run every raw frame through the shared cleaning helper\n",
+    "df1_clean, df2_clean, df3_clean = (clean_df(raw) for raw in (df1, df2, df3))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "29fc4ab1-6e7f-4261-bd86-f9a3c98e9b83",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Stack the three cleaned frames row-wise and renumber the index from 0\n",
+    "combined_df = pd.concat(\n",
+    "    objs=(df1_clean, df2_clean, df3_clean),\n",
+    "    ignore_index=True,\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6d01613a-1438-473b-92e9-576b446f4350",
+   "metadata": {},
+   "source": [
+    "## Challenge 2: Structuring Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "1b563ce0-6a7f-4d45-a275-3d0567fd5f36",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Marketing customer analysis CSV (raw GitHub URL)\n",
+    "marketing_url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\"\n",
+    "\n",
+    "# Load the file, then normalise the headers: trimmed, lower-case, snake_case\n",
+    "marketing_df = pd.read_csv(marketing_url)\n",
+    "marketing_df.columns = (\n",
+    "    marketing_df.columns\n",
+    "    .str.strip()\n",
+    "    .str.lower()\n",
+    "    .str.replace(' ', '_')\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "d39a1bb1-0f59-44a1-ba96-bf1dd8f3d240",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "               total_claim_amount\n",
+      "sales_channel                    \n",
+      "Agent                  1810226.82\n",
+      "Branch                 1301204.00\n",
+      "Call Center             926600.82\n",
+      "Web                     706600.04\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Total revenue per sales channel, rounded to 2 decimals.\n",
+    "# total_claim_amount is the column summed as the revenue figure here.\n",
+    "revenue_by_channel = marketing_df.pivot_table(\n",
+    "    index='sales_channel',\n",
+    "    values='total_claim_amount',\n",
+    "    aggfunc='sum',\n",
+    ").round(2)\n",
+    "\n",
+    "print(revenue_by_channel)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "96aa97ca-88ae-40fb-afe0-34f3169b8096",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "education  Bachelor  College   Doctor  High School or Below   Master\n",
+      "gender                                                              \n",
+      "F           7874.27  7748.82  7328.51               8675.22  8157.05\n",
+      "M           7703.60  8052.46  7415.33               8149.69  8168.83\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Mean customer lifetime value, rounded to 2 decimals:\n",
+    "# one row per gender, one column per education level.\n",
+    "avg_clv = marketing_df.pivot_table(\n",
+    "    index='gender',\n",
+    "    columns='education',\n",
+    "    values='customer_lifetime_value',\n",
+    "    aggfunc='mean',\n",
+    ").round(2)\n",
+    "\n",
+    "print(avg_clv)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "e4b26496-6d92-43ec-9a68-b84361f0e9aa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "      policy_type  month  num_complaints\n",
+      "0  Corporate Auto      1            1252\n",
+      "1  Corporate Auto      2            1089\n",
+      "2   Personal Auto      1            4329\n",
+      "3   Personal Auto      2            3799\n",
+      "4    Special Auto      1             237\n",
+      "5    Special Auto      2             204\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Number of open complaints by policy type and month.\n",
+    "# Sum the per-customer complaint counts: .size() would only count the rows\n",
+    "# (customers) in each group, not the complaints they filed.\n",
+    "# NOTE(review): assumes marketing_df has a 'number_of_open_complaints' column,\n",
+    "# as files 1-3 do after header normalisation -- confirm against the CSV.\n",
+    "complaints_summary = (\n",
+    "    marketing_df\n",
+    "    .groupby(['policy_type', 'month'])['number_of_open_complaints']\n",
+    "    .sum()\n",
+    "    .reset_index(name='num_complaints')\n",
+    ")\n",
+    "\n",
+    "# One row per (policy_type, month) pair, i.e. already in long format\n",
+    "print(complaints_summary)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "32a8d2f9-a466-46db-98b4-1e5035c41130",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}