diff --git a/LAB Pandas Deep Dive.ipynb b/LAB Pandas Deep Dive.ipynb new file mode 100644 index 0000000..817b1c8 --- /dev/null +++ b/LAB Pandas Deep Dive.ipynb @@ -0,0 +1,1869 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pandas Deep-Dive\n", + "\n", + "#### 1. Import Pandas package under the alias pd and Numpy under the alias np." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2. Define a variable called `path` that contains the path to the csv file you downloaded. If you are reading the data from Ironhack's database, define the variables needed to create the connection with the database. " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'C:\\\\Users\\\\dulce\\\\Downloads\\\\apple_store.csv'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "path = r\"C:\\Users\\dulce\\Downloads\\apple_store.csv\"\n", + "path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3. Using the pandas, read the data either from the downladed file or the database. \n", + "\n", + "Assign the returned value to a variable called `data`." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtrack_namesize_bytespricerating_count_totrating_count_veruser_ratinguser_rating_verprime_genre
0281656475PAC-MAN Premium1007882243.9921292264.04.5Games
1281796108Evernote - stay organized1585786880.00161065264.03.5Productivity
2281940292WeatherBug - Local Weather, Radar, Maps, Alerts1005240320.0018858328223.54.5Weather
3282614216eBay: Best App to Buy, Sell, Save! Online Shop...1285120000.002622416494.04.5Shopping
4282935706Bible927744000.0098592053204.55.0Reference
..............................
71921187617475Kubik1266442240.00142754.54.5Games
71931187682390VR Roller-Coaster1207603200.0030304.54.5Games
71941187779532Bret Michaels Emojis + Lyric Keyboard1113221121.991504.50.0Utilities
71951187838770VR Roller Coaster World - Virtual Reality972359680.0085324.54.5Games
71961188375727Escape the Sweet Shop Series908984320.00335.05.0Games
\n", + "

7197 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " id track_name \\\n", + "0 281656475 PAC-MAN Premium \n", + "1 281796108 Evernote - stay organized \n", + "2 281940292 WeatherBug - Local Weather, Radar, Maps, Alerts \n", + "3 282614216 eBay: Best App to Buy, Sell, Save! Online Shop... \n", + "4 282935706 Bible \n", + "... ... ... \n", + "7192 1187617475 Kubik \n", + "7193 1187682390 VR Roller-Coaster \n", + "7194 1187779532 Bret Michaels Emojis + Lyric Keyboard \n", + "7195 1187838770 VR Roller Coaster World - Virtual Reality \n", + "7196 1188375727 Escape the Sweet Shop Series \n", + "\n", + " size_bytes price rating_count_tot rating_count_ver user_rating \\\n", + "0 100788224 3.99 21292 26 4.0 \n", + "1 158578688 0.00 161065 26 4.0 \n", + "2 100524032 0.00 188583 2822 3.5 \n", + "3 128512000 0.00 262241 649 4.0 \n", + "4 92774400 0.00 985920 5320 4.5 \n", + "... ... ... ... ... ... \n", + "7192 126644224 0.00 142 75 4.5 \n", + "7193 120760320 0.00 30 30 4.5 \n", + "7194 111322112 1.99 15 0 4.5 \n", + "7195 97235968 0.00 85 32 4.5 \n", + "7196 90898432 0.00 3 3 5.0 \n", + "\n", + " user_rating_ver prime_genre \n", + "0 4.5 Games \n", + "1 3.5 Productivity \n", + "2 4.5 Weather \n", + "3 4.5 Shopping \n", + "4 5.0 Reference \n", + "... ... ... \n", + "7192 4.5 Games \n", + "7193 4.5 Games \n", + "7194 0.0 Utilities \n", + "7195 4.5 Games \n", + "7196 5.0 Games \n", + "\n", + "[7197 rows x 9 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.read_csv(r\"C:\\Users\\dulce\\Downloads\\apple_store.csv\")\n", + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4. Print the first 5 rows of `data` to see what the data look like.\n", + "\n", + "A data analyst usually does this to have a general understanding about what the data look like before digging deep." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtrack_namesize_bytespricerating_count_totrating_count_veruser_ratinguser_rating_verprime_genre
0281656475PAC-MAN Premium1007882243.9921292264.04.5Games
1281796108Evernote - stay organized1585786880.00161065264.03.5Productivity
2281940292WeatherBug - Local Weather, Radar, Maps, Alerts1005240320.0018858328223.54.5Weather
3282614216eBay: Best App to Buy, Sell, Save! Online Shop...1285120000.002622416494.04.5Shopping
4282935706Bible927744000.0098592053204.55.0Reference
\n", + "
" + ], + "text/plain": [ + " id track_name size_bytes \\\n", + "0 281656475 PAC-MAN Premium 100788224 \n", + "1 281796108 Evernote - stay organized 158578688 \n", + "2 281940292 WeatherBug - Local Weather, Radar, Maps, Alerts 100524032 \n", + "3 282614216 eBay: Best App to Buy, Sell, Save! Online Shop... 128512000 \n", + "4 282935706 Bible 92774400 \n", + "\n", + " price rating_count_tot rating_count_ver user_rating user_rating_ver \\\n", + "0 3.99 21292 26 4.0 4.5 \n", + "1 0.00 161065 26 4.0 3.5 \n", + "2 0.00 188583 2822 3.5 4.5 \n", + "3 0.00 262241 649 4.0 4.5 \n", + "4 0.00 985920 5320 4.5 5.0 \n", + "\n", + " prime_genre \n", + "0 Games \n", + "1 Productivity \n", + "2 Weather \n", + "3 Shopping \n", + "4 Reference " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5. Print the summary of the data." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " id track_name \\\n", + "0 281656475 PAC-MAN Premium \n", + "1 281796108 Evernote - stay organized \n", + "2 281940292 WeatherBug - Local Weather, Radar, Maps, Alerts \n", + "3 282614216 eBay: Best App to Buy, Sell, Save! Online Shop... \n", + "4 282935706 Bible \n", + "... ... ... \n", + "7192 1187617475 Kubik \n", + "7193 1187682390 VR Roller-Coaster \n", + "7194 1187779532 Bret Michaels Emojis + Lyric Keyboard \n", + "7195 1187838770 VR Roller Coaster World - Virtual Reality \n", + "7196 1188375727 Escape the Sweet Shop Series \n", + "\n", + " size_bytes price rating_count_tot rating_count_ver user_rating \\\n", + "0 100788224 3.99 21292 26 4.0 \n", + "1 158578688 0.00 161065 26 4.0 \n", + "2 100524032 0.00 188583 2822 3.5 \n", + "3 128512000 0.00 262241 649 4.0 \n", + "4 92774400 0.00 985920 5320 4.5 \n", + "... ... ... ... ... ... \n", + "7192 126644224 0.00 142 75 4.5 \n", + "7193 120760320 0.00 30 30 4.5 \n", + "7194 111322112 1.99 15 0 4.5 \n", + "7195 97235968 0.00 85 32 4.5 \n", + "7196 90898432 0.00 3 3 5.0 \n", + "\n", + " user_rating_ver prime_genre \n", + "0 4.5 Games \n", + "1 3.5 Productivity \n", + "2 4.5 Weather \n", + "3 4.5 Shopping \n", + "4 5.0 Reference \n", + "... ... ... \n", + "7192 4.5 Games \n", + "7193 4.5 Games \n", + "7194 0.0 Utilities \n", + "7195 4.5 Games \n", + "7196 5.0 Games \n", + "\n", + "[7197 rows x 9 columns]\n" + ] + } + ], + "source": [ + "print(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 6. Print the number of columns in the data." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_columns = data.shape[1]\n", + "num_columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 7. Print all column names." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'track_name', 'size_bytes', 'price', 'rating_count_tot',\n", + " 'rating_count_ver', 'user_rating', 'user_rating_ver', 'prime_genre'],\n", + " dtype='object')" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 8.- Now that we have a general understanding of the data, we'll start working on the challenge questions. How many apps are there in the data source? Print the number of observations of the data.\n", + "\n", + "**Hint**: Your code should return the number 7197." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7197" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "number_of_apps = data.shape[0]\n", + "number_of_apps" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 9. What is the average rating of all apps? \n", + "\n", + "First, read the `user_rating` column into a varialbe named `user_rating`." + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 4.0\n", + "1 4.0\n", + "2 3.5\n", + "3 4.0\n", + "4 4.5\n", + " ... \n", + "7192 4.5\n", + "7193 4.5\n", + "7194 4.5\n", + "7195 4.5\n", + "7196 5.0\n", + "Name: user_rating, Length: 7197, dtype: float64" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_rating = data['user_rating']\n", + "user_rating" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now you can calculate the average of the `user_rating` data.\n", + "\n", + "**Hint**: Your code should return 3.526955675976101." + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.526955675976101" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_rating = data['user_rating'].mean()\n", + "user_rating" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 10. How many apps have an average rating no less than 4?\n", + "\n", + "First, filter `user_rating` where its value >= 4. \n", + "\n", + "Assign the filtered dataframe to a new variable called `user_rating_high`." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtrack_namesize_bytespricerating_count_totrating_count_veruser_ratinguser_rating_verprime_genre
0281656475PAC-MAN Premium1007882243.9921292264.04.5Games
1281796108Evernote - stay organized1585786880.00161065264.03.5Productivity
3282614216eBay: Best App to Buy, Sell, Save! Online Shop...1285120000.002622416494.04.5Shopping
4282935706Bible927744000.0098592053204.55.0Reference
5283619399Shanghai Mahjong104857130.99825355164.04.0Games
..............................
71921187617475Kubik1266442240.00142754.54.5Games
71931187682390VR Roller-Coaster1207603200.0030304.54.5Games
71941187779532Bret Michaels Emojis + Lyric Keyboard1113221121.991504.50.0Utilities
71951187838770VR Roller Coaster World - Virtual Reality972359680.0085324.54.5Games
71961188375727Escape the Sweet Shop Series908984320.00335.05.0Games
\n", + "

4781 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " id track_name \\\n", + "0 281656475 PAC-MAN Premium \n", + "1 281796108 Evernote - stay organized \n", + "3 282614216 eBay: Best App to Buy, Sell, Save! Online Shop... \n", + "4 282935706 Bible \n", + "5 283619399 Shanghai Mahjong \n", + "... ... ... \n", + "7192 1187617475 Kubik \n", + "7193 1187682390 VR Roller-Coaster \n", + "7194 1187779532 Bret Michaels Emojis + Lyric Keyboard \n", + "7195 1187838770 VR Roller Coaster World - Virtual Reality \n", + "7196 1188375727 Escape the Sweet Shop Series \n", + "\n", + " size_bytes price rating_count_tot rating_count_ver user_rating \\\n", + "0 100788224 3.99 21292 26 4.0 \n", + "1 158578688 0.00 161065 26 4.0 \n", + "3 128512000 0.00 262241 649 4.0 \n", + "4 92774400 0.00 985920 5320 4.5 \n", + "5 10485713 0.99 8253 5516 4.0 \n", + "... ... ... ... ... ... \n", + "7192 126644224 0.00 142 75 4.5 \n", + "7193 120760320 0.00 30 30 4.5 \n", + "7194 111322112 1.99 15 0 4.5 \n", + "7195 97235968 0.00 85 32 4.5 \n", + "7196 90898432 0.00 3 3 5.0 \n", + "\n", + " user_rating_ver prime_genre \n", + "0 4.5 Games \n", + "1 3.5 Productivity \n", + "3 4.5 Shopping \n", + "4 5.0 Reference \n", + "5 4.0 Games \n", + "... ... ... \n", + "7192 4.5 Games \n", + "7193 4.5 Games \n", + "7194 0.0 Utilities \n", + "7195 4.5 Games \n", + "7196 5.0 Games \n", + "\n", + "[4781 rows x 9 columns]" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_rating_high = data[data['user_rating'] >= 4]\n", + "user_rating_high" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now obtain the length of `user_rating_high` which should return 4781." + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4781" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(user_rating_high)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 11. How many genres are there in total for all the apps?\n", + "\n", + "Define a new variable named `genres` that contains the `prime_genre` column of `data`. Google for how to obtain unique values of a dataframe column. " + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
0Games
1Productivity
2Weather
3Shopping
4Reference
5Finance
6Music
7Utilities
8Travel
9Social Networking
10Sports
11Business
12Health & Fitness
13Entertainment
14Photo & Video
15Navigation
16Education
17Lifestyle
18Food & Drink
19News
20Book
21Medical
22Catalogs
\n", + "
" + ], + "text/plain": [ + " 0\n", + "0 Games\n", + "1 Productivity\n", + "2 Weather\n", + "3 Shopping\n", + "4 Reference\n", + "5 Finance\n", + "6 Music\n", + "7 Utilities\n", + "8 Travel\n", + "9 Social Networking\n", + "10 Sports\n", + "11 Business\n", + "12 Health & Fitness\n", + "13 Entertainment\n", + "14 Photo & Video\n", + "15 Navigation\n", + "16 Education\n", + "17 Lifestyle\n", + "18 Food & Drink\n", + "19 News\n", + "20 Book\n", + "21 Medical\n", + "22 Catalogs" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "genres = data['prime_genre'].unique()\n", + "genres\n", + "\n", + "pd.DataFrame(genres)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print the length of the unique values of `genres`. Your code should return 23." + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "23" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "genres = data['prime_genre'].unique()\n", + "\n", + "unique_genres = len(genres)\n", + "unique_genres" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 12. What are the top 3 genres that have the most number of apps?\n", + "\n", + "What you want to do is to count the number of occurrences of each unique genre values. Because you already know how to obtain the unique genre values, you can of course count the # of apps of each genre one by one. However, Pandas has a convient function to let you count all values of a dataframe column with a single command. Google for \"pandas count values\" to find the solution. Your code should return the following:\n", + "\n", + "```\n", + "Games 3862\n", + "Entertainment 535\n", + "Education 453\n", + "Name: prime_genre, dtype: int64\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Games 3862\n", + "Entertainment 535\n", + "Education 453\n", + "Name: prime_genre, dtype: int64" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "count_of_genres = data['prime_genre'].value_counts()\n", + "\n", + "top_3_genres = count_of_genres.head(3)\n", + "top_3_genres" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 13. Which genre is most likely to contain free apps?\n", + "\n", + "First, filter `data` where the price is 0.00. Assign the filtered data to a new variable called `free_apps`. Then count the values in `free_apps`. Your code should return:\n", + "\n", + "```\n", + "Games 2257\n", + "Entertainment 334\n", + "Photo & Video 167\n", + "Social Networking 143\n", + "Education 132\n", + "Shopping 121\n", + "Utilities 109\n", + "Lifestyle 94\n", + "Finance 84\n", + "Sports 79\n", + "Health & Fitness 76\n", + "Music 67\n", + "Book 66\n", + "Productivity 62\n", + "News 58\n", + "Travel 56\n", + "Food & Drink 43\n", + "Weather 31\n", + "Navigation 20\n", + "Reference 20\n", + "Business 20\n", + "Catalogs 9\n", + "Medical 8\n", + "Name: prime_genre, dtype: int64\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Games 2257\n", + "Entertainment 334\n", + "Photo & Video 167\n", + "Social Networking 143\n", + "Education 132\n", + "Shopping 121\n", + "Utilities 109\n", + "Lifestyle 94\n", + "Finance 84\n", + "Sports 79\n", + "Health & Fitness 76\n", + "Music 67\n", + "Book 66\n", + "Productivity 62\n", + "News 58\n", + "Travel 56\n", + "Food & Drink 43\n", + "Weather 31\n", + "Business 20\n", + "Reference 20\n", + "Navigation 20\n", + "Catalogs 9\n", + "Medical 8\n", + "Name: prime_genre, dtype: int64" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "free_apps = data[data['price'] == 0.0]\n", + "\n", + "free_apps_number = free_apps['prime_genre'].value_counts()\n", + "free_apps_number" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 14. Now you can calculate the proportion of the free apps in each genre based on the value counts you obtained in the previous two steps. \n", + "\n", + "Challenge yourself by achieving that with one line of code. The output should look like:\n", + "\n", + "```\n", + "Shopping 0.991803\n", + "Catalogs 0.900000\n", + "Social Networking 0.856287\n", + "Finance 0.807692\n", + "News 0.773333\n", + "Sports 0.692982\n", + "Travel 0.691358\n", + "Food & Drink 0.682540\n", + "Lifestyle 0.652778\n", + "Entertainment 0.624299\n", + "Book 0.589286\n", + "Games 0.584412\n", + "Music 0.485507\n", + "Photo & Video 0.478510\n", + "Utilities 0.439516\n", + "Navigation 0.434783\n", + "Weather 0.430556\n", + "Health & Fitness 0.422222\n", + "Business 0.350877\n", + "Productivity 0.348315\n", + "Medical 0.347826\n", + "Reference 0.312500\n", + "Education 0.291391\n", + "Name: prime_genre, dtype: float64\n", + "```\n", + "\n", + "The numbers are interesting, aren't they?" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Book 0.589286\n", + "Business 0.350877\n", + "Catalogs 0.900000\n", + "Education 0.291391\n", + "Entertainment 0.624299\n", + "Finance 0.807692\n", + "Food & Drink 0.682540\n", + "Games 0.584412\n", + "Health & Fitness 0.422222\n", + "Lifestyle 0.652778\n", + "Medical 0.347826\n", + "Music 0.485507\n", + "Navigation 0.434783\n", + "News 0.773333\n", + "Photo & Video 0.478510\n", + "Productivity 0.348315\n", + "Reference 0.312500\n", + "Shopping 0.991803\n", + "Social Networking 0.856287\n", + "Sports 0.692982\n", + "Travel 0.691358\n", + "Utilities 0.439516\n", + "Weather 0.430556\n", + "Name: prime_genre, dtype: float64" + ] + }, + "execution_count": 120, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "free_apps_number = free_apps['prime_genre'].value_counts()\n", + "count_of_genres = data['prime_genre'].value_counts()\n", + "\n", + "proportion_of_free_apps = free_apps_number / count_of_genres\n", + "proportion_of_free_apps" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 15. If a developer tries to make money by developing and selling Apple Store apps, in which genre should s/he develop the apps? Please assume all apps cost the same amount of time and expense to develop.\n", + "\n", + "We will leave this question to you. There are several way to solve it. Ideally your output should look like below:\n", + "\n", + "```\n", + " average_price genre\n", + "21 8.776087 Medical\n", + "11 5.116316 Business\n", + "4 4.836875 Reference\n", + "6 4.835435 Music\n", + "1 4.330562 Productivity\n", + "15 4.124783 Navigation\n", + "16 4.028234 Education\n", + "12 1.916444 Health & Fitness\n", + "20 1.790536 Book\n", + "7 1.647621 Utilities\n", + "2 1.605417 Weather\n", + "18 1.552381 Food & Drink\n", + "14 1.473295 Photo & Video\n", + "0 1.432923 Games\n", + "8 1.120370 Travel\n", + "10 0.953070 Sports\n", + "13 0.889701 Entertainment\n", + "17 0.885417 Lifestyle\n", + "22 0.799000 Catalogs\n", + "19 0.517733 News\n", + "5 0.421154 Finance\n", + "9 0.339880 Social Networking\n", + "3 0.016311 Shopping\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "prime_genre\n", + "Book 1.790536\n", + "Business 5.116316\n", + "Catalogs 0.799000\n", + "Education 4.028234\n", + "Entertainment 0.889701\n", + "Finance 0.421154\n", + "Food & Drink 1.552381\n", + "Games 1.432923\n", + "Health & Fitness 1.916444\n", + "Lifestyle 0.885417\n", + "Medical 8.776087\n", + "Music 4.835435\n", + "Navigation 4.124783\n", + "News 0.517733\n", + "Photo & Video 1.473295\n", + "Productivity 4.330562\n", + "Reference 4.836875\n", + "Shopping 0.016311\n", + "Social Networking 0.339880\n", + "Sports 0.953070\n", + "Travel 1.120370\n", + "Utilities 1.647621\n", + "Weather 1.605417\n", + "Name: price, dtype: float64" + ] + }, + "execution_count": 126, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "average_price = data.groupby('prime_genre')['price'].mean()\n", + "average_price" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Challenge - Applying Functions to DataFrames\n", + "\n", + "#### Our next step is to use the apply function to a dataframe and transform all cells.\n", + "\n", + "To do this, we will load a dataset below and then write a function that will perform the transformation." + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [], + "source": [ + "# Run this code:\n", + "\n", + "# The dataset below contains information about pollution from PM2.5 particles in Beijing \n", + "\n", + "url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/00381/PRSA_data_2010.1.1-2014.12.31.csv\"\n", + "pm25 = pd.read_csv(url)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's look at the data using the head() function." + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Noyearmonthdayhourpm2.5DEWPTEMPPREScbwdIwsIsIr
012010110NaN-21-11.01021.0NW1.7900
122010111NaN-21-12.01020.0NW4.9200
232010112NaN-21-11.01019.0NW6.7100
342010113NaN-21-14.01019.0NW9.8400
452010114NaN-20-12.01018.0NW12.9700
\n", + "
" + ], + "text/plain": [ + " No year month day hour pm2.5 DEWP TEMP PRES cbwd Iws Is Ir\n", + "0 1 2010 1 1 0 NaN -21 -11.0 1021.0 NW 1.79 0 0\n", + "1 2 2010 1 1 1 NaN -21 -12.0 1020.0 NW 4.92 0 0\n", + "2 3 2010 1 1 2 NaN -21 -11.0 1019.0 NW 6.71 0 0\n", + "3 4 2010 1 1 3 NaN -21 -14.0 1019.0 NW 9.84 0 0\n", + "4 5 2010 1 1 4 NaN -20 -12.0 1018.0 NW 12.97 0 0" + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pm25.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next step is to create a function that divides a cell by 24 to produce an hourly figure. Write the function below." + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "def hourly(x):\n", + " '''\n", + " Input: A numerical value\n", + " Output: The value divided by 24\n", + " \n", + " Example:\n", + " Input: 48\n", + " Output: 2.0\n", + " '''" + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 0.041667\n", + "1 0.083333\n", + "2 0.125000\n", + "3 0.166667\n", + "dtype: float64" + ] + }, + "execution_count": 176, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def hourly(x):\n", + " return x.apply(lambda cell: cell / 24)\n", + "\n", + "output = hourly(data)\n", + "output" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Apply this function to the columns Iws, Is, and Ir. Store this new dataframe in the variable pm25_hourly." + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IwsIsIr
00.0745830.00.0
10.2050000.00.0
20.2795830.00.0
30.4100000.00.0
40.5404170.00.0
............
438199.6654170.00.0
438209.9075000.00.0
4382110.1125000.00.0
4382210.2800000.00.0
4382310.4104170.00.0
\n", + "

43824 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " Iws Is Ir\n", + "0 0.074583 0.0 0.0\n", + "1 0.205000 0.0 0.0\n", + "2 0.279583 0.0 0.0\n", + "3 0.410000 0.0 0.0\n", + "4 0.540417 0.0 0.0\n", + "... ... ... ...\n", + "43819 9.665417 0.0 0.0\n", + "43820 9.907500 0.0 0.0\n", + "43821 10.112500 0.0 0.0\n", + "43822 10.280000 0.0 0.0\n", + "43823 10.410417 0.0 0.0\n", + "\n", + "[43824 rows x 3 columns]" + ] + }, + "execution_count": 147, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pm25_hourly = pm25[['Iws', 'Is', 'Ir']].apply(hourly)\n", + "pm25_hourly" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Our last challenge will be to create an aggregate function and apply it to a select group of columns in our dataframe.\n", + "\n", + "Write a function that returns the standard deviation of a column divided by the length of a column minus 1. Since we are using pandas, do not use the `len()` function. One alternative is to use `count()`. Also, use the numpy version of standard deviation." + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": {}, + "outputs": [ + { + "ename": "IndentationError", + "evalue": "unexpected indent (3843731183.py, line 2)", + "output_type": "error", + "traceback": [ + "\u001b[1;36m Cell \u001b[1;32mIn[162], line 2\u001b[1;36m\u001b[0m\n\u001b[1;33m '''\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mIndentationError\u001b[0m\u001b[1;31m:\u001b[0m unexpected indent\n" + ] + } + ], + "source": [ + "# def sample_sd(x):\n", + " '''\n", + " Input: A Pandas series of values\n", + " Output: the standard deviation divided by the number of elements in the series\n", + " \n", + " Example:\n", + " Input: pd.Series([1,2,3,4])\n", + " Output: 0.3726779962\n", + " '''" + ] + }, + { + "cell_type": "code", + "execution_count": 178, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.4303314829119352\n" + ] + } + ], + "source": [ + "def sample_sd(data):\n", + " number_count = data.count() - 1 \n", + " standard_deviation = np.std(data, ddof=1) \n", + " result = standard_deviation / number_count\n", + " return result\n", + "\n", + "data = pd.Series([12, 13, 14, 15])\n", + "output = sample_sd(data)\n", + "print(output)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}