diff --git a/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb b/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb new file mode 100644 index 0000000..cd6b4a9 --- /dev/null +++ b/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb @@ -0,0 +1,591 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab | Hypothesis Testing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Objective**\n", + "\n", + "Welcome to the Hypothesis Testing Lab, where we embark on an enlightening journey through the realm of statistical decision-making! In this laboratory, we delve into various scenarios, applying the powerful tools of hypothesis testing to scrutinize and interpret data.\n", + "\n", + "From testing the mean of a single sample (One Sample T-Test), to investigating differences between independent groups (Two Sample T-Test), and exploring relationships within dependent samples (Paired Sample T-Test), our exploration knows no bounds. Furthermore, we'll venture into the realm of Analysis of Variance (ANOVA), unraveling the complexities of comparing means across multiple groups.\n", + "\n", + "So, grab your statistical tools, prepare your hypotheses, and let's embark on this fascinating journey of exploration and discovery in the world of hypothesis testing!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Challenge 1**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this challenge, we will be working with pokemon data. The data can be found here:\n", + "\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#libraries\n", + "import pandas as pd\n", + "import scipy.stats as st\n", + "import numpy as np\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameType 1Type 2HPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
0BulbasaurGrassPoison4549496565451False
1IvysaurGrassPoison6062638080601False
2VenusaurGrassPoison808283100100801False
3Mega VenusaurGrassPoison80100123122120801False
4CharmanderFireNaN3952436050651False
....................................
795DiancieRockFairy50100150100150506True
796Mega DiancieRockFairy501601101601101106True
797Hoopa ConfinedPsychicGhost8011060150130706True
798Hoopa UnboundPsychicDark8016060170130806True
799VolcanionFireWater8011012013090706True
\n", + "

800 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Name Type 1 Type 2 HP Attack Defense Sp. Atk Sp. Def \\\n", + "0 Bulbasaur Grass Poison 45 49 49 65 65 \n", + "1 Ivysaur Grass Poison 60 62 63 80 80 \n", + "2 Venusaur Grass Poison 80 82 83 100 100 \n", + "3 Mega Venusaur Grass Poison 80 100 123 122 120 \n", + "4 Charmander Fire NaN 39 52 43 60 50 \n", + ".. ... ... ... .. ... ... ... ... \n", + "795 Diancie Rock Fairy 50 100 150 100 150 \n", + "796 Mega Diancie Rock Fairy 50 160 110 160 110 \n", + "797 Hoopa Confined Psychic Ghost 80 110 60 150 130 \n", + "798 Hoopa Unbound Psychic Dark 80 160 60 170 130 \n", + "799 Volcanion Fire Water 80 110 120 130 90 \n", + "\n", + " Speed Generation Legendary \n", + "0 45 1 False \n", + "1 60 1 False \n", + "2 80 1 False \n", + "3 80 1 False \n", + "4 65 1 False \n", + ".. ... ... ... \n", + "795 50 6 True \n", + "796 110 6 True \n", + "797 70 6 True \n", + "798 80 6 True \n", + "799 70 6 True \n", + "\n", + "[800 rows x 11 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv\")\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- We posit that Pokemons of type Dragon have, on average, more HP stats than Grass. Choose the propper test and, with 5% significance, comment your findings." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "T-statistic: 3.3350\n", + "One-sided p-value: 0.0008\n" + ] + } + ], + "source": [ + "dragon_hp = df[df['Type 1'] == 'Dragon']['HP']\n", + "grass_hp = df[df['Type 1'] == 'Grass']['HP']\n", + "\n", + "t_stat, p_val_two_sided = st.ttest_ind(dragon_hp, grass_hp, equal_var=False)\n", + "\n", + "p_val_one_sided = p_val_two_sided / 2 if t_stat > 0 else 1 - p_val_two_sided / 2\n", + "\n", + "print(f\"T-statistic: {t_stat: .4f}\")\n", + "print(f\"One-sided p-value: {p_val_one_sided: .4f}\")\n", + "\n", + "# If p_val_one_sided < 0.05, we reject H₀, Dragon has a significantly higher mean HP. Otherwise, we fail to reject H₀." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- We posit that Legendary Pokemons have different stats (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed) when comparing with Non-Legendary. Choose the propper test and, with 5% significance, comment your findings.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HP | t = 8.9814 | p = 0.0000\n", + "Attack | t = 10.4381 | p = 0.0000\n", + "Defense | t = 7.6371 | p = 0.0000\n", + "Sp. Atk | t = 13.4174 | p = 0.0000\n", + "Sp. Def | t = 10.0157 | p = 0.0000\n", + "Speed | t = 11.4750 | p = 0.0000\n" + ] + } + ], + "source": [ + "legendary = df[df['Legendary'] == True]\n", + "non_legendary = df[df['Legendary'] == False]\n", + "\n", + "stats_cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']\n", + "\n", + "results = []\n", + "for col in stats_cols:\n", + " t_stat, p_val = st.ttest_ind(legendary[col], non_legendary[col], equal_var=False)\n", + " results.append((col, t_stat, p_val))\n", + "\n", + "for col, t_stat, p_val in results:\n", + " print(f\"{col:10} | t = {t_stat:8.4f} | p = {p_val:8.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Challenge 2**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this challenge, we will be working with california-housing data. The data can be found here:\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_value
0-114.3134.1915.05612.01283.01015.0472.01.493666900.0
1-114.4734.4019.07650.01901.01129.0463.01.820080100.0
2-114.5633.6917.0720.0174.0333.0117.01.650985700.0
3-114.5733.6414.01501.0337.0515.0226.03.191773400.0
4-114.5733.5720.01454.0326.0624.0262.01.925065500.0
\n", + "
" + ], + "text/plain": [ + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -114.31 34.19 15.0 5612.0 1283.0 \n", + "1 -114.47 34.40 19.0 7650.0 1901.0 \n", + "2 -114.56 33.69 17.0 720.0 174.0 \n", + "3 -114.57 33.64 14.0 1501.0 337.0 \n", + "4 -114.57 33.57 20.0 1454.0 326.0 \n", + "\n", + " population households median_income median_house_value \n", + "0 1015.0 472.0 1.4936 66900.0 \n", + "1 1129.0 463.0 1.8200 80100.0 \n", + "2 333.0 117.0 1.6509 85700.0 \n", + "3 515.0 226.0 3.1917 73400.0 \n", + "4 624.0 262.0 1.9250 65500.0 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "housing = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n", + "housing.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**We posit that houses close to either a school or a hospital are more expensive.**\n", + "\n", + "- School coordinates (-118, 34)\n", + "- Hospital coordinates (-122, 37)\n", + "\n", + "We consider a house (neighborhood) to be close to a school or hospital if the distance is lower than 0.50.\n", + "\n", + "Hint:\n", + "- Write a function to calculate euclidean distance from each house (neighborhood) to the school and to the hospital.\n", + "- Divide your dataset into houses close and far from either a hospital or school.\n", + "- Choose the propper test and, with 5% significance, comment your findings.\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "T-statistic: 37.9796\n", + "P-value: 0.0000\n" + ] + } + ], + "source": [ + "school_coords = (-118, 34)\n", + "hospital_coords = (-122, 37)\n", + "\n", + "def euclidean_distance(lon, lat, lon_ref, lat_ref):\n", + " return np.sqrt((lon - lon_ref)**2 + (lat - lat_ref)**2)\n", + "\n", + "housing['dist_school'] = euclidean_distance(housing['longitude'], housing['latitude'], school_coords[0], school_coords[1])\n", + "housing['dist_hospital'] = euclidean_distance(housing['longitude'], housing['latitude'], hospital_coords[0], hospital_coords[1])\n", + "\n", + "housing['close'] = (housing['dist_school'] <= 0.50) | (housing['dist_hospital'] <= 0.50)\n", + "\n", + "close_prices = housing[housing['close']]['median_house_value']\n", + "far_prices = housing[~housing['close']]['median_house_value']\n", + "\n", + "t_stat, p_val = st.ttest_ind(close_prices, far_prices, equal_var=False)\n", + "\n", + "print(f\"T-statistic: {t_stat: .4f}\")\n", + "print(f\"P-value: {p_val: .4f}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb index 0cc26d5..cd6b4a9 100644 --- a/lab-hypothesis-testing.ipynb +++ b/lab-hypothesis-testing.ipynb @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -278,7 +278,7 @@ "[800 rows x 11 columns]" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -297,11 +297,30 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "T-statistic: 3.3350\n", + "One-sided p-value: 0.0008\n" + ] + } + ], "source": [ - "#code here" + "dragon_hp = df[df['Type 1'] == 'Dragon']['HP']\n", + "grass_hp = df[df['Type 1'] == 'Grass']['HP']\n", + "\n", + "t_stat, p_val_two_sided = st.ttest_ind(dragon_hp, grass_hp, equal_var=False)\n", + "\n", + "p_val_one_sided = p_val_two_sided / 2 if t_stat > 0 else 1 - p_val_two_sided / 2\n", + "\n", + "print(f\"T-statistic: {t_stat: .4f}\")\n", + "print(f\"One-sided p-value: {p_val_one_sided: .4f}\")\n", + "\n", + "# If p_val_one_sided < 0.05, we reject H₀, Dragon has a significantly higher mean HP. Otherwise, we fail to reject H₀." ] }, { @@ -313,11 +332,35 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HP | t = 8.9814 | p = 0.0000\n", + "Attack | t = 10.4381 | p = 0.0000\n", + "Defense | t = 7.6371 | p = 0.0000\n", + "Sp. Atk | t = 13.4174 | p = 0.0000\n", + "Sp. Def | t = 10.0157 | p = 0.0000\n", + "Speed | t = 11.4750 | p = 0.0000\n" + ] + } + ], "source": [ - "#code here" + "legendary = df[df['Legendary'] == True]\n", + "non_legendary = df[df['Legendary'] == False]\n", + "\n", + "stats_cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']\n", + "\n", + "results = []\n", + "for col in stats_cols:\n", + " t_stat, p_val = st.ttest_ind(legendary[col], non_legendary[col], equal_var=False)\n", + " results.append((col, t_stat, p_val))\n", + "\n", + "for col, t_stat, p_val in results:\n", + " print(f\"{col:10} | t = {t_stat:8.4f} | p = {p_val:8.4f}\")" ] }, { @@ -337,7 +380,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -453,14 +496,14 @@ "4 624.0 262.0 1.9250 65500.0 " ] }, - "execution_count": 5, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n", - "df.head()" + "housing = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n", + "housing.head()" ] }, { @@ -483,10 +526,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "T-statistic: 37.9796\n", + "P-value: 0.0000\n" + ] + } + ], + "source": [ + "school_coords = (-118, 34)\n", + "hospital_coords = (-122, 37)\n", + "\n", + "def euclidean_distance(lon, lat, lon_ref, lat_ref):\n", + " return np.sqrt((lon - lon_ref)**2 + (lat - lat_ref)**2)\n", + "\n", + "housing['dist_school'] = euclidean_distance(housing['longitude'], housing['latitude'], school_coords[0], school_coords[1])\n", + "housing['dist_hospital'] = euclidean_distance(housing['longitude'], housing['latitude'], hospital_coords[0], hospital_coords[1])\n", + "\n", + "housing['close'] = (housing['dist_school'] <= 0.50) | (housing['dist_hospital'] <= 0.50)\n", + "\n", + "close_prices = housing[housing['close']]['median_house_value']\n", + "far_prices = housing[~housing['close']]['median_house_value']\n", + "\n", + "t_stat, p_val = st.ttest_ind(close_prices, far_prices, equal_var=False)\n", + "\n", + "print(f\"T-statistic: {t_stat: .4f}\")\n", + "print(f\"P-value: {p_val: .4f}\")\n" + ] }, { "cell_type": "code", @@ -512,9 +583,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 }