diff --git a/python_linear_regression/data/healthcare_investments_and_hospital_stay.csv b/python_linear_regression/data/healthcare_investments_and_hospital_stay.csv new file mode 100644 index 000000000..95d47f0b0 --- /dev/null +++ b/python_linear_regression/data/healthcare_investments_and_hospital_stay.csv @@ -0,0 +1,519 @@ +Location,Time,Hospital_Stay,MRI_Units,CT_Scanners,Hospital_Beds +AUS,1992,6.6,1.43,16.71,1.43 +AUS,1994,6.4,2.36,18.48,2.36 +AUS,1995,6.5,2.89,20.55,2.89 +AUS,1996,6.4,2.96,21.95,2.96 +AUS,1997,6.2,3.53,23.34,3.53 +AUS,1998,6.1,4.51,24.18,4.51 +AUS,1999,6.2,6.01,25.52,6.01 +AUS,2000,6.1,3.52,26.28,3.52 +AUS,2001,6.2,3.79,29.05,3.79 +AUS,2002,6.2,3.74,34.37,3.74 +AUS,2003,6.1,3.7,40.57,3.7 +AUS,2004,6.1,3.76,45.65,3.76 +AUS,2005,6.0,4.26,51.54,4.26 +AUS,2006,5.9,4.89,56.72,4.89 +AUS,2009,5.1,5.72,39.14,5.72 +AUS,2010,5.0,5.67,43.07,5.67 +AUS,2011,4.9,5.6,44.32,5.6 +AUS,2012,4.8,5.5,50.5,5.5 +AUS,2013,4.7,13.84,53.66,13.84 +AUS,2014,4.7,14.65,56.06,14.65 +AUS,2015,4.2,14.49,59.54,14.49 +AUS,2016,4.2,14.3,63.0,14.3 +AUS,2017,4.1,14.15,64.34,14.15 +AUT,1996,9.5,7.54,24.25,7.54 +AUT,1997,8.3,8.53,25.23,8.53 +AUT,1998,8.2,8.52,26.08,8.52 +AUT,1999,7.8,11.01,26.02,11.01 +AUT,2000,7.6,10.98,26.09,10.98 +AUT,2001,7.4,11.69,26.61,11.69 +AUT,2002,7.3,13.36,27.1,13.36 +AUT,2003,7.2,13.54,27.21,13.54 +AUT,2004,7.2,15.91,29.25,15.91 +AUT,2005,6.9,16.16,29.66,16.16 +AUT,2006,6.9,16.81,29.87,16.81 +AUT,2007,6.8,17.72,30.02,17.72 +AUT,2008,6.8,18.03,29.68,18.03 +AUT,2009,6.7,18.46,29.36,18.46 +AUT,2010,6.6,18.65,29.89,18.65 +AUT,2011,6.5,18.71,29.55,18.71 +AUT,2012,6.5,19.1,29.77,19.1 +AUT,2013,6.5,19.22,29.6,19.22 +AUT,2014,6.5,19.66,29.37,19.66 +AUT,2015,6.5,20.71,28.93,20.71 +AUT,2016,6.4,22.43,29.07,22.43 +AUT,2017,6.4,22.96,28.64,22.96 +AUT,2018,6.3,23.53,28.84,23.53 +BEL,2003,8.0,6.84,10.5,6.84 +BEL,2004,7.9,7.0,11.23,7.0 +BEL,2005,7.9,6.97,12.79,6.97 +BEL,2006,7.8,7.11,12.51,7.11 +BEL,2007,7.7,7.53,13.08,7.53 +BEL,2008,7.4,10.36,13.91,10.36 +BEL,2009,7.2,10.65,14.26,10.65 +BEL,2010,7.2,10.65,13.95,10.65 +BEL,2011,7.1,10.69,13.77,10.69 +BEL,2012,7.0,10.62,15.04,10.62 +BEL,2013,6.9,10.84,22.94,10.84 +BEL,2014,6.9,11.78,21.77,11.78 +BEL,2015,6.8,11.71,23.59,11.71 +BEL,2016,6.7,11.65,23.92,11.65 +BEL,2017,6.6,11.6,23.82,11.6 +BEL,2018,6.6,11.64,23.89,11.64 +CAN,1990,10.2,0.69,7.15,0.69 +CAN,1991,10.0,0.78,7.13,0.78 +CAN,1992,9.9,0.99,7.33,0.99 +CAN,1993,9.8,1.05,7.53,1.05 +CAN,1994,7.4,1.21,7.69,1.21 +CAN,1995,7.2,1.37,7.99,1.37 +CAN,1997,7.0,1.84,8.19,1.84 +CAN,2001,7.3,4.19,9.77,4.19 +CAN,2003,7.3,4.71,10.27,4.71 +CAN,2004,7.3,4.92,10.68,4.92 +CAN,2005,7.2,5.74,11.57,5.74 +CAN,2006,7.4,6.17,12.04,6.17 +CAN,2007,7.5,6.75,12.74,6.75 +CAN,2009,7.7,7.91,13.8,7.91 +CAN,2010,7.7,8.26,14.23,8.26 +CAN,2011,7.6,8.53,14.62,8.53 +CAN,2012,7.6,8.87,14.69,8.87 +CAN,2013,7.5,8.89,14.77,8.89 +CAN,2015,7.4,9.52,15.07,9.52 +CAN,2017,7.4,10.02,15.35,10.02 +CZE,1991,11.9,0.19,2.13,0.19 +CZE,1992,11.6,0.39,4.65,0.39 +CZE,1993,11.2,0.58,5.71,0.58 +CZE,1994,10.8,0.68,6.19,0.68 +CZE,1995,10.2,0.97,6.68,0.97 +CZE,2000,7.9,1.66,9.65,1.66 +CZE,2005,7.9,3.13,12.34,3.13 +CZE,2007,7.0,4.37,12.91,4.37 +CZE,2008,6.7,5.01,13.39,5.01 +CZE,2009,6.7,5.74,14.17,5.74 +CZE,2010,6.6,6.3,14.51,6.3 +CZE,2011,6.4,6.86,14.77,6.86 +CZE,2012,6.2,6.95,15.03,6.95 +CZE,2013,6.0,7.42,15.03,7.42 +CZE,2014,6.0,7.41,15.11,7.41 +CZE,2015,5.9,8.34,16.12,8.34 +CZE,2016,5.9,8.52,15.52,8.52 +CZE,2017,5.8,9.44,15.76,9.44 +CZE,2018,5.8,10.35,16.09,10.35 +DNK,2000,3.8,5.43,11.42,5.43 +DNK,2002,3.7,8.56,13.77,8.56 
+DNK,2003,3.6,9.09,14.47,9.09 +DNK,2004,3.4,10.18,14.43,10.18 +FIN,1990,7.0,1.8,9.83,1.8 +FIN,1991,7.0,2.19,10.17,2.19 +FIN,1992,6.1,2.38,10.51,2.38 +FIN,1993,5.7,2.76,11.25,2.76 +FIN,1994,5.6,3.34,11.79,3.34 +FIN,1995,5.5,4.31,11.75,4.31 +FIN,1996,6.0,5.66,12.49,5.66 +FIN,1997,5.9,6.61,12.45,6.61 +FIN,1998,5.9,8.34,12.22,8.34 +FIN,1999,5.8,9.1,12.78,9.1 +FIN,2000,6.9,9.85,13.52,9.85 +FIN,2001,7.0,10.99,13.69,10.99 +FIN,2002,7.1,12.5,13.27,12.5 +FIN,2003,7.1,13.04,14.0,13.04 +FIN,2004,7.1,13.96,14.15,13.96 +FIN,2005,7.1,14.68,14.68,14.68 +FIN,2006,7.2,15.19,14.81,15.19 +FIN,2007,7.2,15.32,16.45,15.32 +FIN,2009,7.0,15.73,20.42,15.73 +FIN,2010,7.0,18.65,21.07,18.65 +FIN,2011,6.9,20.23,21.34,20.23 +FIN,2012,6.9,21.61,21.8,21.61 +FIN,2013,6.8,22.06,21.7,22.06 +FIN,2014,6.7,23.25,21.42,23.25 +FIN,2015,6.6,25.91,21.53,25.91 +FIN,2016,6.5,25.48,24.2,25.48 +FIN,2017,6.4,27.05,24.51,27.05 +FIN,2018,6.4,27.38,16.5,27.38 +FRA,1998,5.8,1.18,6.64,1.18 +FRA,1999,5.5,1.51,7.24,1.51 +FRA,2000,5.6,1.65,7.01,1.65 +FRA,2001,5.7,1.83,7.37,1.83 +FRA,2002,5.7,2.4,7.62,2.4 +FRA,2003,6.1,3.17,8.07,3.17 +FRA,2004,6.0,3.85,8.78,3.85 +FRA,2005,5.9,4.78,10.02,4.78 +FRA,2006,5.9,5.19,10.37,5.19 +FRA,2007,5.9,5.48,10.32,5.48 +FRA,2008,5.8,6.06,10.84,6.06 +FRA,2009,5.7,6.43,11.08,6.43 +FRA,2010,5.8,6.96,11.82,6.96 +FRA,2011,5.7,7.51,12.53,7.51 +FRA,2012,5.7,8.65,13.49,8.65 +FRA,2013,5.6,9.4,14.49,9.4 +FRA,2014,5.6,10.86,15.32,10.86 +FRA,2015,5.6,12.56,16.57,12.56 +FRA,2016,5.5,13.55,16.95,13.55 +FRA,2017,5.4,14.21,17.36,14.21 +FRA,2018,5.4,14.77,17.68,14.77 +DEU,2000,10.1,14.32,24.61,14.32 +DEU,2001,9.8,15.96,25.19,15.96 +DEU,2002,9.6,17.51,27.06,17.51 +DEU,2003,9.3,18.48,27.55,18.48 +DEU,2004,8.9,18.97,28.71,18.97 +DEU,2005,8.8,19.89,29.51,19.89 +DEU,2006,8.7,21.39,29.12,21.39 +DEU,2007,8.5,22.43,29.73,22.43 +DEU,2008,8.3,23.6,31.15,23.6 +DEU,2009,8.2,25.15,31.24,25.15 +DEU,2010,8.1,27.04,32.32,27.04 +DEU,2011,7.9,28.86,33.48,28.86 +DEU,2012,7.8,28.66,34.01,28.66 +DEU,2013,7.7,28.92,33.72,28.92 +DEU,2014,7.6,30.5,35.34,30.5 +DEU,2015,7.6,33.63,35.09,33.63 +DEU,2016,7.5,34.49,35.17,34.49 +DEU,2017,7.5,34.71,35.13,34.71 +GRC,2005,5.6,13.38,25.48,13.38 +GRC,2006,5.8,16.51,26.68,16.51 +GRC,2007,5.4,18.1,29.33,18.1 +GRC,2008,5.4,19.86,31.05,19.86 +GRC,2009,5.3,22.06,31.24,22.06 +GRC,2010,5.3,22.93,32.73,22.93 +GRC,2011,5.4,22.42,33.14,22.42 +GRC,2012,5.2,21.91,33.41,21.91 +GRC,2013,5.6,22.07,33.65,22.07 +GRC,2014,5.6,22.86,34.61,22.86 +HUN,1990,9.9,0.1,1.93,0.1 +HUN,1991,9.7,0.29,2.99,0.29 +HUN,1992,9.5,0.29,3.09,0.29 +HUN,1993,9.5,0.39,3.86,0.39 +HUN,1994,9.8,0.77,4.16,0.77 +HUN,1995,9.2,0.97,4.55,0.97 +HUN,1996,8.6,1.36,4.95,1.36 +HUN,1997,8.2,1.36,4.57,1.36 +HUN,1998,7.8,1.46,4.97,1.46 +HUN,1999,7.5,1.47,5.08,1.47 +HUN,2000,7.1,1.76,5.68,1.76 +HUN,2001,7.0,1.96,5.99,1.96 +HUN,2002,6.9,2.26,6.3,2.26 +HUN,2003,6.7,2.57,6.52,2.57 +HUN,2004,6.7,2.57,6.83,2.57 +HUN,2005,6.5,2.58,7.14,2.58 +HUN,2006,6.4,2.58,7.25,2.58 +HUN,2007,6.0,2.78,7.26,2.78 +HUN,2008,6.0,2.79,7.07,2.79 +HUN,2009,5.8,2.79,7.18,2.79 +HUN,2010,5.8,3.0,7.3,3.0 +HUN,2011,5.8,3.01,7.32,3.01 +HUN,2012,5.8,2.82,7.66,2.82 +HUN,2013,5.7,3.03,7.88,3.03 +HUN,2014,5.6,3.14,8.31,3.14 +HUN,2015,5.5,3.56,8.43,3.56 +HUN,2016,5.5,3.97,8.86,3.97 +HUN,2017,5.5,4.7,9.19,4.7 +HUN,2018,5.4,4.91,9.41,4.91 +IRL,2006,6.3,7.95,12.63,7.95 +IRL,2007,6.1,8.41,14.09,8.41 +IRL,2008,6.2,8.91,14.26,8.91 +IRL,2009,6.1,11.69,14.99,11.69 +IRL,2010,6.0,12.28,15.35,12.28 +IRL,2011,5.9,13.1,15.72,13.1 +IRL,2012,5.9,12.39,16.74,12.39 +IRL,2013,5.7,13.19,17.73,13.19 
+IRL,2014,5.6,13.31,16.53,13.31 +IRL,2015,5.8,14.04,17.65,14.04 +IRL,2016,5.8,14.72,17.24,14.72 +IRL,2017,5.9,15.18,19.14,15.18 +IRL,2018,5.9,16.03,20.34,16.03 +ITA,1997,7.3,4.11,14.8,4.11 +ITA,1998,7.2,5.82,18.01,5.82 +ITA,1999,7.0,6.25,18.99,6.25 +ITA,2000,7.0,7.76,21.13,7.76 +ITA,2001,7.0,9.07,23.01,9.07 +ITA,2002,6.7,10.85,24.05,10.85 +ITA,2003,6.7,11.9,23.92,11.9 +ITA,2004,6.7,14.09,26.23,14.09 +ITA,2005,6.7,15.01,27.82,15.01 +ITA,2006,6.7,16.96,29.29,16.96 +ITA,2007,6.7,18.77,30.55,18.77 +ITA,2008,6.8,20.06,30.96,20.06 +ITA,2009,6.7,21.59,31.85,21.59 +ITA,2010,6.7,22.47,32.17,22.47 +ITA,2011,6.8,24.17,32.62,24.17 +ITA,2012,6.8,24.62,33.29,24.62 +ITA,2013,6.8,25.2,33.1,25.2 +ITA,2014,6.8,26.19,32.9,26.19 +ITA,2015,6.9,28.24,33.31,28.24 +ITA,2016,6.9,28.4,34.29,28.4 +ITA,2017,6.9,28.66,34.57,28.66 +ITA,2018,7.0,28.73,35.12,28.73 +JPN,1996,32.7,18.75,74.7,18.75 +JPN,1999,27.2,23.19,84.41,23.19 +JPN,2002,22.2,35.32,92.62,35.32 +JPN,2008,18.8,42.96,96.97,42.96 +JPN,2011,17.9,46.86,101.25,46.86 +JPN,2014,16.9,51.69,107.17,51.69 +JPN,2017,16.2,55.21,111.49,55.21 +KOR,1993,11.0,1.81,12.22,1.81 +KOR,1994,11.0,2.87,13.69,2.87 +KOR,1995,11.0,3.86,15.5,3.86 +KOR,1996,11.0,4.7,20.12,4.7 +KOR,1997,11.0,5.14,21.02,5.14 +KOR,2000,11.0,5.4,28.38,5.4 +KOR,2001,11.0,6.8,27.3,6.8 +KOR,2002,11.0,7.85,30.94,7.85 +KOR,2003,10.6,8.98,31.86,8.98 +KOR,2010,10.0,19.88,35.17,19.88 +KOR,2011,10.1,21.27,35.79,21.27 +KOR,2012,9.2,23.37,36.93,23.37 +KOR,2013,8.9,24.35,37.5,24.35 +KOR,2014,8.0,25.5,36.85,25.5 +KOR,2015,7.9,26.27,37.03,26.27 +KOR,2016,7.6,27.81,37.8,27.81 +KOR,2017,7.6,29.08,38.18,29.08 +KOR,2018,7.5,30.08,38.56,30.08 +LUX,2002,7.5,4.48,24.65,4.48 +LUX,2003,7.4,11.07,26.57,11.07 +LUX,2004,7.2,10.91,28.38,10.91 +LUX,2005,7.2,10.75,27.95,10.75 +LUX,2006,7.4,10.58,27.51,10.58 +LUX,2007,7.5,10.42,27.08,10.42 +LUX,2008,7.3,12.28,26.6,12.28 +LUX,2009,7.5,14.06,26.12,14.06 +LUX,2010,7.6,13.81,25.64,13.81 +LUX,2011,7.3,13.5,25.08,13.5 +LUX,2012,7.4,13.18,24.48,13.18 +LUX,2013,7.3,12.88,22.08,12.88 +LUX,2014,7.3,12.58,21.57,12.58 +LUX,2015,7.4,12.29,17.56,12.29 +LUX,2016,7.4,12.0,17.14,12.0 +LUX,2017,7.4,11.74,16.77,11.74 +LUX,2018,7.6,11.51,16.45,11.51 +NLD,1990,11.2,0.87,7.29,0.87 +NLD,1992,10.6,1.78,7.24,1.78 +NLD,1993,10.4,2.49,9.03,2.49 +NLD,2004,7.5,6.2,7.12,6.2 +NLD,2005,7.2,6.56,8.21,6.56 +NLD,2006,6.6,7.83,8.38,7.83 +NLD,2007,6.2,7.63,7.81,7.63 +NLD,2008,6.0,10.4,10.22,10.4 +NLD,2009,5.6,10.95,11.25,10.95 +NLD,2010,5.6,12.22,12.34,12.22 +NLD,2011,6.5,12.88,12.52,12.88 +NLD,2012,6.4,11.82,10.92,11.82 +NLD,2013,6.7,11.49,11.54,11.49 +NLD,2014,6.7,12.87,13.34,12.87 +NLD,2015,5.0,12.51,13.75,12.51 +NLD,2016,5.0,12.8,13.04,12.8 +NLD,2017,5.0,13.02,13.48,13.02 +NLD,2018,5.1,13.06,14.22,13.06 +NZL,2003,5.2,3.72,11.42,3.72 +NZL,2007,5.9,8.76,12.31,8.76 +NZL,2008,6.2,9.62,12.44,9.62 +NZL,2009,6.1,9.76,14.64,9.76 +NZL,2010,6.1,10.57,15.63,10.57 +NZL,2011,6.2,11.18,15.51,11.18 +NZL,2012,6.0,11.12,15.43,11.12 +NZL,2013,5.6,11.26,16.66,11.26 +NZL,2015,5.4,13.3,17.88,13.3 +NZL,2016,5.1,13.89,17.96,13.89 +NZL,2017,5.0,13.64,16.79,13.64 +POL,2005,7.9,2.02,7.94,2.02 +POL,2006,7.6,1.94,9.23,1.94 +POL,2007,7.4,2.7,9.65,2.7 +POL,2008,7.5,2.94,10.86,2.94 +POL,2009,7.4,3.7,12.4,3.7 +POL,2010,7.3,4.71,14.38,4.71 +POL,2011,7.1,4.83,13.61,4.83 +POL,2012,6.8,5.49,15.4,5.49 +POL,2013,6.7,6.78,17.09,6.78 +POL,2014,6.6,6.6,15.63,6.6 +POL,2015,6.9,7.63,17.16,7.63 +POL,2016,6.7,7.87,17.33,7.87 +POL,2017,6.6,7.93,16.88,7.93 +POL,2018,6.5,9.22,18.14,9.22 +PRT,2006,8.6,5.8,25.94,5.8 +PRT,2007,8.4,8.92,26.18,8.92 
+PRT,2008,8.3,9.28,27.56,9.28 +SVK,2003,7.4,2.05,9.12,2.05 +SVK,2004,7.3,3.72,10.24,3.72 +SVK,2005,7.3,4.28,11.35,4.28 +SVK,2006,7.2,4.47,12.28,4.47 +SVK,2007,7.0,5.77,13.77,5.77 +SVK,2008,6.9,6.13,13.76,6.13 +SVK,2009,6.7,6.13,13.37,6.13 +SVK,2010,6.6,6.86,14.1,6.86 +SVK,2011,6.3,7.04,15.0,7.04 +SVK,2012,6.2,6.29,15.53,6.29 +SVK,2013,6.2,6.65,15.33,6.65 +SVK,2014,7.0,8.3,17.35,8.3 +SVK,2015,6.9,8.85,17.88,8.85 +SVK,2016,6.8,9.02,17.31,9.02 +SVK,2017,6.8,9.56,17.28,9.56 +SVK,2018,6.7,9.55,18.36,9.55 +ESP,2010,6.4,11.98,15.95,11.98 +ESP,2011,6.2,13.76,16.64,13.76 +ESP,2012,6.1,14.77,17.19,14.77 +ESP,2013,6.1,15.34,17.59,15.34 +ESP,2014,6.0,15.51,17.6,15.51 +ESP,2015,6.1,15.85,18.02,15.85 +ESP,2016,6.0,16.09,18.31,16.09 +ESP,2017,6.0,16.38,18.65,16.38 +ESP,2018,6.0,17.2,19.12,17.2 +TUR,2002,5.8,0.88,4.89,0.88 +TUR,2003,5.7,1.48,5.63,1.48 +TUR,2004,5.6,2.2,6.6,2.2 +TUR,2005,5.3,2.91,7.44,2.91 +TUR,2006,5.1,4.47,8.56,4.47 +TUR,2007,4.4,5.84,9.62,5.84 +TUR,2008,4.1,7.91,10.68,7.91 +TUR,2009,4.1,8.68,11.63,8.68 +TUR,2010,4.0,9.27,12.36,9.27 +TUR,2011,3.9,9.55,13.12,9.55 +TUR,2012,3.9,9.58,13.53,9.58 +TUR,2013,3.9,9.86,13.89,9.86 +TUR,2014,4.0,9.81,13.88,9.81 +TUR,2015,3.9,10.15,14.31,10.15 +TUR,2016,4.0,10.55,14.53,10.55 +TUR,2017,4.1,11.01,14.77,11.01 +TUR,2018,4.1,11.24,14.88,11.24 +GBR,2001,7.7,6.21,6.88,6.21 +GBR,2002,7.5,4.99,7.29,4.99 +GBR,2003,7.3,4.54,6.91,4.54 +GBR,2004,7.1,5.0,7.02,5.0 +GBR,2005,6.9,5.4,7.45,5.4 +GBR,2006,6.6,5.62,7.53,5.62 +GBR,2008,6.3,5.5,7.26,5.5 +GBR,2010,6.1,6.55,7.92,6.55 +GBR,2011,6.0,6.96,8.48,6.96 +GBR,2012,6.0,7.16,9.09,7.16 +GBR,2013,6.0,7.2,9.3,7.2 +GBR,2014,6.0,7.23,9.46,7.23 +USA,1997,6.1,11.41,24.1,11.41 +USA,1999,5.9,13.19,25.09,13.19 +USA,2001,5.8,17.44,28.88,17.44 +USA,2003,5.7,19.32,29.26,19.32 +USA,2004,5.6,26.67,32.29,26.67 +USA,2006,5.6,26.58,34.02,26.58 +USA,2007,5.5,25.93,34.31,25.93 +USA,2012,5.4,34.46,43.89,34.46 +USA,2013,5.4,35.51,43.5,35.51 +USA,2014,5.5,38.12,41.05,38.12 +USA,2015,5.5,39.03,41.01,39.03 +USA,2016,5.5,36.74,41.88,36.74 +USA,2017,5.5,37.65,42.74,37.65 +EST,2005,6.0,2.21,7.38,2.21 +EST,2006,5.9,3.71,7.42,3.71 +EST,2007,5.9,5.22,11.19,5.22 +EST,2008,5.7,8.23,14.96,8.23 +EST,2009,5.6,7.49,14.99,7.49 +EST,2010,5.5,8.26,15.77,8.26 +EST,2011,5.5,9.79,16.57,9.79 +EST,2012,5.6,9.83,17.39,9.83 +EST,2013,6.0,11.38,18.97,11.38 +EST,2014,5.9,11.41,19.78,11.41 +EST,2015,6.0,12.16,16.72,12.16 +EST,2016,6.1,13.68,17.48,13.68 +EST,2017,6.1,13.66,18.22,13.66 +EST,2018,6.1,13.62,18.91,13.62 +ISR,2000,7.1,1.43,5.57,1.43 +ISR,2001,6.2,1.4,6.37,1.4 +ISR,2002,5.9,1.37,6.24,1.37 +ISR,2003,5.8,1.64,5.83,1.64 +ISR,2004,6.0,1.62,6.32,1.62 +ISR,2005,5.7,1.73,6.49,1.73 +ISR,2006,5.5,1.84,6.38,1.84 +ISR,2007,5.2,2.23,7.94,2.23 +ISR,2008,5.1,2.33,8.21,2.33 +ISR,2009,5.1,2.27,8.68,2.27 +ISR,2010,5.2,2.23,8.79,2.23 +ISR,2011,5.2,2.7,8.76,2.7 +ISR,2012,5.1,3.29,8.98,3.29 +ISR,2013,5.2,3.47,8.93,3.47 +ISR,2014,5.1,4.02,9.49,4.02 +ISR,2015,5.2,4.06,9.67,4.06 +ISR,2016,5.2,4.91,9.6,4.91 +ISR,2017,5.1,5.16,9.53,5.16 +ISR,2018,5.0,5.18,9.57,5.18 +RUS,1993,13.6,0.92,1.58,0.92 +RUS,1994,13.6,0.77,1.48,0.77 +RUS,1995,13.6,0.61,1.82,0.61 +RUS,1996,13.6,0.7,2.1,0.7 +RUS,1997,14.3,0.85,2.21,0.85 +RUS,1998,14.0,0.74,2.32,0.74 +RUS,1999,13.7,0.88,2.39,0.88 +RUS,2000,13.5,1.13,2.58,1.13 +RUS,2001,13.2,1.11,2.66,1.11 +RUS,2002,12.9,1.31,2.77,1.31 +RUS,2004,12.2,1.36,3.32,1.36 +RUS,2005,11.9,1.54,3.77,1.54 +RUS,2006,11.5,2.12,4.04,2.12 +RUS,2007,11.4,2.02,4.42,2.02 +RUS,2008,11.3,2.27,5.02,2.27 +RUS,2009,11.0,2.52,6.02,2.52 +RUS,2010,10.8,2.51,6.9,2.51 
+RUS,2011,11.3,2.62,7.72,2.62 +RUS,2012,10.8,4.17,9.09,4.17 +RUS,2013,10.3,3.99,11.28,3.99 +RUS,2014,9.9,4.44,12.2,4.44 +RUS,2015,9.7,4.64,12.56,4.64 +RUS,2016,9.4,4.52,12.76,4.52 +RUS,2017,9.3,4.6,13.0,4.6 +RUS,2018,9.1,4.84,13.37,4.84 +SVN,2006,5.8,6.48,10.46,6.48 +SVN,2008,5.7,6.43,12.37,6.43 +SVN,2009,5.6,7.35,11.77,7.35 +SVN,2010,5.5,7.81,12.69,7.81 +SVN,2011,6.8,8.77,12.67,8.77 +SVN,2012,6.9,8.75,12.64,8.75 +SVN,2013,6.6,9.22,12.14,9.22 +SVN,2014,6.6,9.21,13.09,9.21 +SVN,2015,6.5,9.21,13.08,9.21 +SVN,2016,6.5,11.14,14.04,11.14 +SVN,2017,6.6,11.61,15.0,11.61 +SVN,2018,6.7,12.05,15.91,12.05 +ISL,2007,5.6,19.26,32.1,19.26 +ISL,2008,5.5,18.9,31.5,18.9 +ISL,2009,5.5,21.98,34.54,21.98 +ISL,2010,5.4,22.01,37.73,22.01 +ISL,2011,5.3,21.94,40.75,21.94 +ISL,2012,5.5,21.83,40.53,21.83 +ISL,2013,5.6,21.62,40.15,21.62 +ISL,2014,5.8,21.38,39.71,21.38 +ISL,2015,5.9,21.16,39.3,21.16 +ISL,2016,5.9,20.87,38.76,20.87 +ISL,2017,5.7,20.38,43.68,20.38 +ISL,2018,5.6,19.85,48.2,19.85 +LVA,2003,7.9,1.31,13.55,1.31 +LVA,2004,7.8,0.88,15.02,0.88 +LVA,2005,7.4,2.68,18.31,2.68 +LVA,2006,7.2,2.7,18.48,2.7 +LVA,2007,7.1,5.0,21.81,5.0 +LVA,2008,7.1,6.89,23.88,6.89 +LVA,2009,6.1,7.47,25.68,7.47 +LVA,2010,6.2,8.1,29.08,8.1 +LVA,2011,6.0,9.22,31.07,9.22 +LVA,2012,5.8,9.83,32.44,9.83 +LVA,2013,5.8,10.43,34.78,10.43 +LVA,2014,5.9,12.54,36.11,12.54 +LVA,2015,6.0,12.64,36.91,12.64 +LVA,2016,5.9,13.78,36.23,13.78 +LVA,2017,6.0,13.9,39.13,13.9 +LVA,2018,6.0,13.49,38.4,13.49 +LTU,2000,9.2,0.29,6.57,0.29 +LTU,2001,9.0,0.86,7.2,0.86 +LTU,2002,8.7,0.87,8.71,0.87 +LTU,2003,8.3,0.88,9.08,0.88 +LTU,2004,8.2,1.18,11.55,1.18 +LTU,2005,8.1,1.5,12.04,1.5 +LTU,2006,8.0,3.06,12.84,3.06 +LTU,2007,7.7,3.4,10.52,3.4 +LTU,2008,7.5,4.38,13.76,4.38 +LTU,2009,7.2,5.37,16.12,5.37 +LTU,2010,7.1,4.84,18.73,4.84 +LTU,2011,7.0,5.94,20.14,5.94 +LTU,2012,6.9,10.04,23.76,10.04 +LTU,2013,6.9,10.48,23.67,10.48 +LTU,2014,6.8,10.57,22.17,10.57 +LTU,2015,6.6,11.02,21.0,11.02 +LTU,2016,6.6,12.2,23.01,12.2 +LTU,2017,6.5,12.37,23.33,12.37 +LTU,2018,6.5,12.49,24.27,12.49 diff --git a/python_linear_regression/python_linear_regression.md b/python_linear_regression/python_linear_regression.md new file mode 100644 index 000000000..aa18d8dc0 --- /dev/null +++ b/python_linear_regression/python_linear_regression.md @@ -0,0 +1,557 @@ + + +# Python Lesson on Regression for Machine Learning + +@overview + + +## Summary of Key Concepts in Linear Regression + +- **Definition**: Linear regression is a statistical method used to model and analyze the relationships between a dependent variable and one or more independent variables. + +- **Applications**: Commonly used in machine learning to predict continuous outcomes. + +- **Practical Application**: + - Applying linear regression to real-world datasets, such as synthetic healthcare investments and the diabetes dataset. + +- **Evaluation and Beyond**: + - Recognize model limitations and explore further analysis, such as: + - Nonlinear relationships. + - Model assumptions. + - Feature selection and engineering. + - Regularization techniques (Ridge, Lasso). + - Advanced models (Random Forests, Gradient Boosting). + + +- Linear regression is a starting point for data analysis and machine learning. +- The foundation built here prepares you for advanced techniques and complex challenges. +- Success in data analysis involves understanding data, asking the right questions, and critically evaluating results. 
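+
+Before moving to code, it helps to see the model written out. With a single predictor, the linear regression model is
+
+$$\hat{y} = \beta_0 + \beta_1 x$$
+
+where $\beta_0$ is the intercept and $\beta_1$ is the slope. With several predictors this extends to $\hat{y} = \beta_0 + \beta_1 x_1 + \dots + \beta_p x_p$, and fitting the model means estimating these coefficients from the data.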
+
+
+## Python Implementation of Linear Regression
+
+To implement linear regression in Python using Scikit-learn, we can follow these steps:
+
+### 1. Import Libraries
+
+**Description:**
+This code block imports the libraries needed for data manipulation and machine learning: NumPy for numerical operations, pandas for data manipulation, and scikit-learn (sklearn) for machine learning functionality.
+
+**Why this is important:**
+Importing libraries is the first step in any data analysis or machine learning project. These libraries provide tools and functions to efficiently handle data, perform mathematical operations, and build machine learning models.
+
+* **numpy (np):** Provides tools for working with numerical arrays and mathematical operations.
+* **pandas (pd):** Enables data manipulation and analysis with data structures like DataFrames.
+* **sklearn:** A powerful machine learning library. We specifically use:
+  * `train_test_split`: Splits data into training (model building) and testing (model evaluation) sets.
+  * `StandardScaler`: Standardizes features to have zero mean and unit variance (often important for linear regression).
+  * `LinearRegression`: The core linear regression model.
+
+```python
+import numpy as np
+import pandas as pd
+
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.linear_model import LinearRegression
+```
+@Pyodide.eval
+
+**Output:**
+There is no output from this code block. It simply imports the libraries required for the subsequent steps in the machine learning workflow.
+
+### 2. Load the Data
+
+**Description:**
+
+* `pd.read_csv(file)`: Reads data from a CSV file into a pandas DataFrame.
+* `data.info()`: Prints a summary of the DataFrame, including column names, data types, and counts of non-null values.
+
+**Why this is important:**
+Loading the data is the initial step in any data analysis or machine learning task. It's essential to understand the structure of the data, such as the number of features and their data types, before proceeding with further analysis.
+
+```python @Pyodide.exec
+
+import pandas as pd
+import io
+from pyodide.http import open_url
+
+# URL of the CSV file
+url = "https://raw.githubusercontent.com/arcus/education_modules/linear_regression/python_linear_regression/data/healthcare_investments_and_hospital_stay.csv"
+
+# Open and read the contents of the URL
+url_contents = open_url(url)
+text = url_contents.read()
+
+# Create a file-like object from the text content
+file = io.StringIO(text)
+
+# Read the CSV data into a pandas DataFrame
+data = pd.read_csv(file)
+
+# Analyze data and features
+data.info()
+```
+
+**Output:**
+After executing this code block, you will see a summary of the loaded data, including information about columns, data types, and non-null values. This helps you understand the dataset you will be working with.
+
+### 3. The `onehot_encode` Function
+
+**Description:**
+
+* This function handles categorical features (like the `Location` column in our data) by creating new columns, one for each unique category. A value is 1 if the data point belongs to that category and 0 otherwise.
+
+**Why this is important:**
+One-hot encoding is crucial when dealing with categorical data in machine learning models. Many machine learning algorithms cannot directly handle categorical data, so encoding categories as numerical values allows the algorithms to operate on the data effectively. By creating binary columns for each category, we ensure that each category is treated equally, without imposing any artificial ordering or magnitude among them.
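+
+To see what one-hot encoding produces, here is a tiny self-contained example (the values are made up for illustration). Depending on your pandas version, the dummy columns may contain 0/1 integers or True/False booleans:
+
+```python
+import pandas as pd
+
+# A toy categorical column with three observations
+toy = pd.Series(["AUS", "CAN", "AUS"], name="Location")
+
+# get_dummies creates one column per unique category,
+# marking the rows that belong to that category
+print(pd.get_dummies(toy))
+```
+
+The `onehot_encode` helper below wraps this pattern so it can be reused on any column of a DataFrame: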
+
+```python
+def onehot_encode(df, column):
+    # Make a copy of the DataFrame to avoid modifying the original data
+    df = df.copy()
+
+    # Use the pandas get_dummies function to one-hot encode the specified column
+    dummies = pd.get_dummies(df[column])
+
+    # Concatenate the one-hot encoded columns with the original DataFrame
+    df = pd.concat([df, dummies], axis=1)
+
+    # Drop the original categorical column since it's no longer needed
+    df = df.drop(column, axis=1)
+
+    return df
+```
+@Pyodide.eval
+
+### 4. Make Data Copy and One-Hot Encode
+
+**Description:**
+The code copies the loaded DataFrame `data` into a new DataFrame `df` to ensure that the original data remains unchanged. It then applies one-hot encoding to the `Location` column using the `onehot_encode` function.
+
+**Why this is important:**
+Creating a copy of the DataFrame is essential to prevent unintentional modifications to the original data, which could lead to unexpected results or loss of information. One-hot encoding is necessary to convert categorical variables, such as the `Location` column, into a numerical format, which many machine learning algorithms require.
+
+* Creates a copy so we don't change the original data by accident.
+* Applies one-hot encoding to the `Location` column.
+
+```python
+# Make a copy of the loaded DataFrame to avoid modifying the original data accidentally
+df = data.copy()
+
+# Apply one-hot encoding to the 'Location' column
+df = onehot_encode(df, column='Location')
+
+# Print the resulting DataFrame to observe the effect of one-hot encoding
+print(df.head())
+```
+
+**Output:**
+The `print(df.head())` call displays the first rows of the encoded DataFrame, where the `Location` column has been replaced by one binary column per country code.
+
+### 5. Separate Target and Features
+
+**Description:**
+
+* This code snippet separates the target variable (`Hospital_Stay`) from the features in the DataFrame `df`.
+* The target variable (`y`) is what we want to predict, while the features (`X`) are the information we'll use to make the prediction.
+
+**Why this is important:**
+
+* Separating the target variable from the features is a crucial step in machine learning model training.
+* The target variable is the variable we aim to predict, while the features are the input variables that influence the prediction.
+* By separating them, we ensure that the model trains on the features to predict the target accurately.
+
+```python
+# Separate the target variable 'Hospital_Stay' from the features
+y = df['Hospital_Stay'].copy()
+X = df.drop('Hospital_Stay', axis=1).copy()
+
+# Print the target variable and features to verify the separation
+print("Target variable (y):")
+print(y.head())
+print("\nFeatures (X):")
+print(X.head())
+```
+
+**Output:**
+The print statements display the first few rows of the target variable `y` and the feature matrix `X`, letting you verify the separation.
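+
+At this point, every column of `X` should be numeric, because the categorical `Location` column has been replaced by its one-hot columns. A quick check along these lines (an optional sketch that assumes the `X` defined above) can confirm that before modeling:
+
+```python
+# All feature dtypes should now be numeric (or boolean, for one-hot columns)
+print(X.dtypes.value_counts())
+
+# Peek at a few of the feature names, including the new one-hot columns
+print(X.columns.tolist()[:8])
+```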
+
+### 6. Split into Training and Testing Sets
+
+**Description:**
+The `train_test_split` function divides the dataset into training and testing sets. Here, 70% of the data is used for training (`X_train`, `y_train`), and the remaining 30% is held back for testing (`X_test`, `y_test`).
+
+**Why this is important:**
+Splitting the data into training and testing sets is crucial in machine learning to assess the performance of the model. The training set is used to train the model, while the testing set is used to evaluate its performance on unseen data. This helps to detect overfitting and ensures that the model generalizes well to new data.
+
+* `random_state=123` ensures we get the same split each time, for reproducibility.
+
+```python
+# Split the data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)
+
+# Print the shapes of the resulting training and testing sets
+print("Training set - X shape:", X_train.shape, "y shape:", y_train.shape)
+print("Testing set - X shape:", X_test.shape, "y shape:", y_test.shape)
+```
+
+**Output:**
+The print statements show the shapes of the resulting training and testing sets, confirming the 70/30 split.
+
+### 7. Standardize Features
+
+**Description:**
+
+* The code initializes a `StandardScaler` object, which will be used to standardize (or z-score normalize) the features.
+* It then fits the scaler to the training data (`X_train`), calculating the mean and standard deviation of each feature in the training set.
+* Finally, it scales both the training and testing data to have zero mean and unit variance using the fitted scaler. This ensures that both datasets are scaled in the same way.
+
+**Why this is important:**
+Standardizing features is crucial, especially when working with algorithms that rely on distance metrics or gradient descent optimization, such as KNN, SVM, or logistic regression. By standardizing the features, we remove the mean and scale the data to unit variance, which can improve the convergence rate of optimization algorithms and prevent features with larger scales from dominating those with smaller scales. For linear regression specifically, it also puts the fitted coefficients on a common scale, which makes them easier to compare.
+
+```python
+# Initialize a StandardScaler object
+scaler = StandardScaler()
+
+# Fit the scaler to the training data, calculating the mean and standard deviation of each feature
+scaler.fit(X_train)
+
+# Scale both training and testing data to have zero mean and unit variance
+X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
+X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
+
+# Print the scaled training and testing data to observe the effect of standardization
+print("Scaled Training Data:")
+print(X_train.head())
+print("\nScaled Testing Data:")
+print(X_test.head())
+```
+
+**Output:**
+The print statements show the first rows of the scaled `X_train` and `X_test` DataFrames; after standardization, each feature in the training set has approximately zero mean and unit variance.
+
+### 8. Create and Train the Model
+
+**Description:**
+
+* This code segment creates a linear regression object using the `LinearRegression` class from the scikit-learn library.
+* It then fits the model to the training data, finding the best-fit line (or plane, in higher dimensions) by minimizing the difference between predicted and actual values in the training data.
+
+**Why this is important:**
+Creating and training a model is the core of supervised machine learning. In this step, we instantiate a regression model and train it on our training data to learn the underlying patterns and relationships between the input features (`X`) and the target variable (`y`). This trained model will later be used to make predictions on new, unseen data.
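+
+Here, "best fit" has a precise meaning: ordinary least squares, which is what `LinearRegression` implements, chooses the coefficients that minimize the sum of squared residuals over the training data,
+
+$$\min_{\beta_0, \dots, \beta_p} \; \sum_{i=1}^{n} \left( y_i - \beta_0 - \beta_1 x_{i1} - \dots - \beta_p x_{ip} \right)^2$$
+
+After fitting, the estimated coefficients are available as `model.coef_` and the intercept as `model.intercept_`, which you can use later to interpret the model.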
+
+```python
+# Create a Linear Regression model object
+model = LinearRegression()
+
+# Fit the model to the training data
+model.fit(X_train, y_train)
+```
+@Pyodide.eval
+
+### 9. Make Predictions
+
+**Description:**
+This code applies the trained machine learning model (`model`) to the testing data (`X_test`) to make predictions about hospital stay durations.
+
+**Why this is important:**
+Making predictions is the ultimate goal of any machine learning model. By applying the trained model to new, unseen data, we can obtain predictions that can be used for decision-making or further analysis.
+
+```python
+# Make predictions using the trained model and the testing data
+y_pred = model.predict(X_test)
+
+# Print the predicted hospital stay durations
+print(y_pred)
+```
+@Pyodide.eval
+
+**Output:**
+The print statement displays the array of predicted hospital stay durations, one prediction per row of the testing set.
+
+### 10. Evaluate the Model
+
+**Description:**
+
+The code calculates and prints two evaluation metrics for the regression model:
+
+* Mean Squared Error (MSE): A measure of how far the predictions are from the actual values on average. Lower values indicate better performance.
+* R² Score: Indicates the proportion of variance in the target variable that is explained by the model. A score of 1 is the best possible; values near 0 mean the model explains little of the variance, and the score can even be negative for a model that fits worse than simply predicting the mean.
+
+**Why this is important:**
+Evaluating the model's performance is crucial to understand how well it generalizes to unseen data. The Mean Squared Error provides a quantitative measure of the model's prediction accuracy, while the R² Score gives insight into the goodness of fit of the model.
+
+```python
+# Mean squared error: average of the squared differences between predictions and actual values
+mse = np.mean((y_pred - y_test)**2)
+print("MSE:", mse)
+print("R^2 Score: {:.5f}".format(model.score(X_test, y_test)))
+```
+
+**Output:**
+This code snippet prints the calculated MSE and R² Score, providing insight into the model's performance.
+
+### Code Overview and Tips
+
+This is a basic example of how to implement linear regression in Python using Scikit-learn. There are many other ways to implement linear regression in Python, but this is a good starting point.
+
+Here are some additional tips for implementing linear regression in Python:
+
+- Make sure to scale the data before training the model. This helps ensure that all features contribute on a comparable scale.
+- Use a validation set to evaluate the model and tune the hyperparameters. This will help to prevent overfitting.
+- Use regularization techniques, such as L1 (Lasso) or L2 (Ridge) regularization, to prevent overfitting.
+- Interpret the coefficients of the linear regression model to understand the relationship between the predictor variables and the target variable.
+
+## Review your knowledge
+
+Which function from Scikit-learn is used to split the dataset into training and testing sets?
+
+[( )] `data_splitter`
+[(X)] `train_test_split`
+[( )] `train_validate_split`
+[( )] `model_splitter`
+***