diff --git a/python_linear_regression/data/healthcare_investments_and_hospital_stay.csv b/python_linear_regression/data/healthcare_investments_and_hospital_stay.csv new file mode 100644 index 000000000..95d47f0b0 --- /dev/null +++ b/python_linear_regression/data/healthcare_investments_and_hospital_stay.csv @@ -0,0 +1,519 @@ +Location,Time,Hospital_Stay,MRI_Units,CT_Scanners,Hospital_Beds +AUS,1992,6.6,1.43,16.71,1.43 +AUS,1994,6.4,2.36,18.48,2.36 +AUS,1995,6.5,2.89,20.55,2.89 +AUS,1996,6.4,2.96,21.95,2.96 +AUS,1997,6.2,3.53,23.34,3.53 +AUS,1998,6.1,4.51,24.18,4.51 +AUS,1999,6.2,6.01,25.52,6.01 +AUS,2000,6.1,3.52,26.28,3.52 +AUS,2001,6.2,3.79,29.05,3.79 +AUS,2002,6.2,3.74,34.37,3.74 +AUS,2003,6.1,3.7,40.57,3.7 +AUS,2004,6.1,3.76,45.65,3.76 +AUS,2005,6.0,4.26,51.54,4.26 +AUS,2006,5.9,4.89,56.72,4.89 +AUS,2009,5.1,5.72,39.14,5.72 +AUS,2010,5.0,5.67,43.07,5.67 +AUS,2011,4.9,5.6,44.32,5.6 +AUS,2012,4.8,5.5,50.5,5.5 +AUS,2013,4.7,13.84,53.66,13.84 +AUS,2014,4.7,14.65,56.06,14.65 +AUS,2015,4.2,14.49,59.54,14.49 +AUS,2016,4.2,14.3,63.0,14.3 +AUS,2017,4.1,14.15,64.34,14.15 +AUT,1996,9.5,7.54,24.25,7.54 +AUT,1997,8.3,8.53,25.23,8.53 +AUT,1998,8.2,8.52,26.08,8.52 +AUT,1999,7.8,11.01,26.02,11.01 +AUT,2000,7.6,10.98,26.09,10.98 +AUT,2001,7.4,11.69,26.61,11.69 +AUT,2002,7.3,13.36,27.1,13.36 +AUT,2003,7.2,13.54,27.21,13.54 +AUT,2004,7.2,15.91,29.25,15.91 +AUT,2005,6.9,16.16,29.66,16.16 +AUT,2006,6.9,16.81,29.87,16.81 +AUT,2007,6.8,17.72,30.02,17.72 +AUT,2008,6.8,18.03,29.68,18.03 +AUT,2009,6.7,18.46,29.36,18.46 +AUT,2010,6.6,18.65,29.89,18.65 +AUT,2011,6.5,18.71,29.55,18.71 +AUT,2012,6.5,19.1,29.77,19.1 +AUT,2013,6.5,19.22,29.6,19.22 +AUT,2014,6.5,19.66,29.37,19.66 +AUT,2015,6.5,20.71,28.93,20.71 +AUT,2016,6.4,22.43,29.07,22.43 +AUT,2017,6.4,22.96,28.64,22.96 +AUT,2018,6.3,23.53,28.84,23.53 +BEL,2003,8.0,6.84,10.5,6.84 +BEL,2004,7.9,7.0,11.23,7.0 +BEL,2005,7.9,6.97,12.79,6.97 +BEL,2006,7.8,7.11,12.51,7.11 +BEL,2007,7.7,7.53,13.08,7.53 +BEL,2008,7.4,10.36,13.91,10.36 +BEL,2009,7.2,10.65,14.26,10.65 +BEL,2010,7.2,10.65,13.95,10.65 +BEL,2011,7.1,10.69,13.77,10.69 +BEL,2012,7.0,10.62,15.04,10.62 +BEL,2013,6.9,10.84,22.94,10.84 +BEL,2014,6.9,11.78,21.77,11.78 +BEL,2015,6.8,11.71,23.59,11.71 +BEL,2016,6.7,11.65,23.92,11.65 +BEL,2017,6.6,11.6,23.82,11.6 +BEL,2018,6.6,11.64,23.89,11.64 +CAN,1990,10.2,0.69,7.15,0.69 +CAN,1991,10.0,0.78,7.13,0.78 +CAN,1992,9.9,0.99,7.33,0.99 +CAN,1993,9.8,1.05,7.53,1.05 +CAN,1994,7.4,1.21,7.69,1.21 +CAN,1995,7.2,1.37,7.99,1.37 +CAN,1997,7.0,1.84,8.19,1.84 +CAN,2001,7.3,4.19,9.77,4.19 +CAN,2003,7.3,4.71,10.27,4.71 +CAN,2004,7.3,4.92,10.68,4.92 +CAN,2005,7.2,5.74,11.57,5.74 +CAN,2006,7.4,6.17,12.04,6.17 +CAN,2007,7.5,6.75,12.74,6.75 +CAN,2009,7.7,7.91,13.8,7.91 +CAN,2010,7.7,8.26,14.23,8.26 +CAN,2011,7.6,8.53,14.62,8.53 +CAN,2012,7.6,8.87,14.69,8.87 +CAN,2013,7.5,8.89,14.77,8.89 +CAN,2015,7.4,9.52,15.07,9.52 +CAN,2017,7.4,10.02,15.35,10.02 +CZE,1991,11.9,0.19,2.13,0.19 +CZE,1992,11.6,0.39,4.65,0.39 +CZE,1993,11.2,0.58,5.71,0.58 +CZE,1994,10.8,0.68,6.19,0.68 +CZE,1995,10.2,0.97,6.68,0.97 +CZE,2000,7.9,1.66,9.65,1.66 +CZE,2005,7.9,3.13,12.34,3.13 +CZE,2007,7.0,4.37,12.91,4.37 +CZE,2008,6.7,5.01,13.39,5.01 +CZE,2009,6.7,5.74,14.17,5.74 +CZE,2010,6.6,6.3,14.51,6.3 +CZE,2011,6.4,6.86,14.77,6.86 +CZE,2012,6.2,6.95,15.03,6.95 +CZE,2013,6.0,7.42,15.03,7.42 +CZE,2014,6.0,7.41,15.11,7.41 +CZE,2015,5.9,8.34,16.12,8.34 +CZE,2016,5.9,8.52,15.52,8.52 +CZE,2017,5.8,9.44,15.76,9.44 +CZE,2018,5.8,10.35,16.09,10.35 +DNK,2000,3.8,5.43,11.42,5.43 +DNK,2002,3.7,8.56,13.77,8.56 
+DNK,2003,3.6,9.09,14.47,9.09 +DNK,2004,3.4,10.18,14.43,10.18 +FIN,1990,7.0,1.8,9.83,1.8 +FIN,1991,7.0,2.19,10.17,2.19 +FIN,1992,6.1,2.38,10.51,2.38 +FIN,1993,5.7,2.76,11.25,2.76 +FIN,1994,5.6,3.34,11.79,3.34 +FIN,1995,5.5,4.31,11.75,4.31 +FIN,1996,6.0,5.66,12.49,5.66 +FIN,1997,5.9,6.61,12.45,6.61 +FIN,1998,5.9,8.34,12.22,8.34 +FIN,1999,5.8,9.1,12.78,9.1 +FIN,2000,6.9,9.85,13.52,9.85 +FIN,2001,7.0,10.99,13.69,10.99 +FIN,2002,7.1,12.5,13.27,12.5 +FIN,2003,7.1,13.04,14.0,13.04 +FIN,2004,7.1,13.96,14.15,13.96 +FIN,2005,7.1,14.68,14.68,14.68 +FIN,2006,7.2,15.19,14.81,15.19 +FIN,2007,7.2,15.32,16.45,15.32 +FIN,2009,7.0,15.73,20.42,15.73 +FIN,2010,7.0,18.65,21.07,18.65 +FIN,2011,6.9,20.23,21.34,20.23 +FIN,2012,6.9,21.61,21.8,21.61 +FIN,2013,6.8,22.06,21.7,22.06 +FIN,2014,6.7,23.25,21.42,23.25 +FIN,2015,6.6,25.91,21.53,25.91 +FIN,2016,6.5,25.48,24.2,25.48 +FIN,2017,6.4,27.05,24.51,27.05 +FIN,2018,6.4,27.38,16.5,27.38 +FRA,1998,5.8,1.18,6.64,1.18 +FRA,1999,5.5,1.51,7.24,1.51 +FRA,2000,5.6,1.65,7.01,1.65 +FRA,2001,5.7,1.83,7.37,1.83 +FRA,2002,5.7,2.4,7.62,2.4 +FRA,2003,6.1,3.17,8.07,3.17 +FRA,2004,6.0,3.85,8.78,3.85 +FRA,2005,5.9,4.78,10.02,4.78 +FRA,2006,5.9,5.19,10.37,5.19 +FRA,2007,5.9,5.48,10.32,5.48 +FRA,2008,5.8,6.06,10.84,6.06 +FRA,2009,5.7,6.43,11.08,6.43 +FRA,2010,5.8,6.96,11.82,6.96 +FRA,2011,5.7,7.51,12.53,7.51 +FRA,2012,5.7,8.65,13.49,8.65 +FRA,2013,5.6,9.4,14.49,9.4 +FRA,2014,5.6,10.86,15.32,10.86 +FRA,2015,5.6,12.56,16.57,12.56 +FRA,2016,5.5,13.55,16.95,13.55 +FRA,2017,5.4,14.21,17.36,14.21 +FRA,2018,5.4,14.77,17.68,14.77 +DEU,2000,10.1,14.32,24.61,14.32 +DEU,2001,9.8,15.96,25.19,15.96 +DEU,2002,9.6,17.51,27.06,17.51 +DEU,2003,9.3,18.48,27.55,18.48 +DEU,2004,8.9,18.97,28.71,18.97 +DEU,2005,8.8,19.89,29.51,19.89 +DEU,2006,8.7,21.39,29.12,21.39 +DEU,2007,8.5,22.43,29.73,22.43 +DEU,2008,8.3,23.6,31.15,23.6 +DEU,2009,8.2,25.15,31.24,25.15 +DEU,2010,8.1,27.04,32.32,27.04 +DEU,2011,7.9,28.86,33.48,28.86 +DEU,2012,7.8,28.66,34.01,28.66 +DEU,2013,7.7,28.92,33.72,28.92 +DEU,2014,7.6,30.5,35.34,30.5 +DEU,2015,7.6,33.63,35.09,33.63 +DEU,2016,7.5,34.49,35.17,34.49 +DEU,2017,7.5,34.71,35.13,34.71 +GRC,2005,5.6,13.38,25.48,13.38 +GRC,2006,5.8,16.51,26.68,16.51 +GRC,2007,5.4,18.1,29.33,18.1 +GRC,2008,5.4,19.86,31.05,19.86 +GRC,2009,5.3,22.06,31.24,22.06 +GRC,2010,5.3,22.93,32.73,22.93 +GRC,2011,5.4,22.42,33.14,22.42 +GRC,2012,5.2,21.91,33.41,21.91 +GRC,2013,5.6,22.07,33.65,22.07 +GRC,2014,5.6,22.86,34.61,22.86 +HUN,1990,9.9,0.1,1.93,0.1 +HUN,1991,9.7,0.29,2.99,0.29 +HUN,1992,9.5,0.29,3.09,0.29 +HUN,1993,9.5,0.39,3.86,0.39 +HUN,1994,9.8,0.77,4.16,0.77 +HUN,1995,9.2,0.97,4.55,0.97 +HUN,1996,8.6,1.36,4.95,1.36 +HUN,1997,8.2,1.36,4.57,1.36 +HUN,1998,7.8,1.46,4.97,1.46 +HUN,1999,7.5,1.47,5.08,1.47 +HUN,2000,7.1,1.76,5.68,1.76 +HUN,2001,7.0,1.96,5.99,1.96 +HUN,2002,6.9,2.26,6.3,2.26 +HUN,2003,6.7,2.57,6.52,2.57 +HUN,2004,6.7,2.57,6.83,2.57 +HUN,2005,6.5,2.58,7.14,2.58 +HUN,2006,6.4,2.58,7.25,2.58 +HUN,2007,6.0,2.78,7.26,2.78 +HUN,2008,6.0,2.79,7.07,2.79 +HUN,2009,5.8,2.79,7.18,2.79 +HUN,2010,5.8,3.0,7.3,3.0 +HUN,2011,5.8,3.01,7.32,3.01 +HUN,2012,5.8,2.82,7.66,2.82 +HUN,2013,5.7,3.03,7.88,3.03 +HUN,2014,5.6,3.14,8.31,3.14 +HUN,2015,5.5,3.56,8.43,3.56 +HUN,2016,5.5,3.97,8.86,3.97 +HUN,2017,5.5,4.7,9.19,4.7 +HUN,2018,5.4,4.91,9.41,4.91 +IRL,2006,6.3,7.95,12.63,7.95 +IRL,2007,6.1,8.41,14.09,8.41 +IRL,2008,6.2,8.91,14.26,8.91 +IRL,2009,6.1,11.69,14.99,11.69 +IRL,2010,6.0,12.28,15.35,12.28 +IRL,2011,5.9,13.1,15.72,13.1 +IRL,2012,5.9,12.39,16.74,12.39 +IRL,2013,5.7,13.19,17.73,13.19 
+IRL,2014,5.6,13.31,16.53,13.31 +IRL,2015,5.8,14.04,17.65,14.04 +IRL,2016,5.8,14.72,17.24,14.72 +IRL,2017,5.9,15.18,19.14,15.18 +IRL,2018,5.9,16.03,20.34,16.03 +ITA,1997,7.3,4.11,14.8,4.11 +ITA,1998,7.2,5.82,18.01,5.82 +ITA,1999,7.0,6.25,18.99,6.25 +ITA,2000,7.0,7.76,21.13,7.76 +ITA,2001,7.0,9.07,23.01,9.07 +ITA,2002,6.7,10.85,24.05,10.85 +ITA,2003,6.7,11.9,23.92,11.9 +ITA,2004,6.7,14.09,26.23,14.09 +ITA,2005,6.7,15.01,27.82,15.01 +ITA,2006,6.7,16.96,29.29,16.96 +ITA,2007,6.7,18.77,30.55,18.77 +ITA,2008,6.8,20.06,30.96,20.06 +ITA,2009,6.7,21.59,31.85,21.59 +ITA,2010,6.7,22.47,32.17,22.47 +ITA,2011,6.8,24.17,32.62,24.17 +ITA,2012,6.8,24.62,33.29,24.62 +ITA,2013,6.8,25.2,33.1,25.2 +ITA,2014,6.8,26.19,32.9,26.19 +ITA,2015,6.9,28.24,33.31,28.24 +ITA,2016,6.9,28.4,34.29,28.4 +ITA,2017,6.9,28.66,34.57,28.66 +ITA,2018,7.0,28.73,35.12,28.73 +JPN,1996,32.7,18.75,74.7,18.75 +JPN,1999,27.2,23.19,84.41,23.19 +JPN,2002,22.2,35.32,92.62,35.32 +JPN,2008,18.8,42.96,96.97,42.96 +JPN,2011,17.9,46.86,101.25,46.86 +JPN,2014,16.9,51.69,107.17,51.69 +JPN,2017,16.2,55.21,111.49,55.21 +KOR,1993,11.0,1.81,12.22,1.81 +KOR,1994,11.0,2.87,13.69,2.87 +KOR,1995,11.0,3.86,15.5,3.86 +KOR,1996,11.0,4.7,20.12,4.7 +KOR,1997,11.0,5.14,21.02,5.14 +KOR,2000,11.0,5.4,28.38,5.4 +KOR,2001,11.0,6.8,27.3,6.8 +KOR,2002,11.0,7.85,30.94,7.85 +KOR,2003,10.6,8.98,31.86,8.98 +KOR,2010,10.0,19.88,35.17,19.88 +KOR,2011,10.1,21.27,35.79,21.27 +KOR,2012,9.2,23.37,36.93,23.37 +KOR,2013,8.9,24.35,37.5,24.35 +KOR,2014,8.0,25.5,36.85,25.5 +KOR,2015,7.9,26.27,37.03,26.27 +KOR,2016,7.6,27.81,37.8,27.81 +KOR,2017,7.6,29.08,38.18,29.08 +KOR,2018,7.5,30.08,38.56,30.08 +LUX,2002,7.5,4.48,24.65,4.48 +LUX,2003,7.4,11.07,26.57,11.07 +LUX,2004,7.2,10.91,28.38,10.91 +LUX,2005,7.2,10.75,27.95,10.75 +LUX,2006,7.4,10.58,27.51,10.58 +LUX,2007,7.5,10.42,27.08,10.42 +LUX,2008,7.3,12.28,26.6,12.28 +LUX,2009,7.5,14.06,26.12,14.06 +LUX,2010,7.6,13.81,25.64,13.81 +LUX,2011,7.3,13.5,25.08,13.5 +LUX,2012,7.4,13.18,24.48,13.18 +LUX,2013,7.3,12.88,22.08,12.88 +LUX,2014,7.3,12.58,21.57,12.58 +LUX,2015,7.4,12.29,17.56,12.29 +LUX,2016,7.4,12.0,17.14,12.0 +LUX,2017,7.4,11.74,16.77,11.74 +LUX,2018,7.6,11.51,16.45,11.51 +NLD,1990,11.2,0.87,7.29,0.87 +NLD,1992,10.6,1.78,7.24,1.78 +NLD,1993,10.4,2.49,9.03,2.49 +NLD,2004,7.5,6.2,7.12,6.2 +NLD,2005,7.2,6.56,8.21,6.56 +NLD,2006,6.6,7.83,8.38,7.83 +NLD,2007,6.2,7.63,7.81,7.63 +NLD,2008,6.0,10.4,10.22,10.4 +NLD,2009,5.6,10.95,11.25,10.95 +NLD,2010,5.6,12.22,12.34,12.22 +NLD,2011,6.5,12.88,12.52,12.88 +NLD,2012,6.4,11.82,10.92,11.82 +NLD,2013,6.7,11.49,11.54,11.49 +NLD,2014,6.7,12.87,13.34,12.87 +NLD,2015,5.0,12.51,13.75,12.51 +NLD,2016,5.0,12.8,13.04,12.8 +NLD,2017,5.0,13.02,13.48,13.02 +NLD,2018,5.1,13.06,14.22,13.06 +NZL,2003,5.2,3.72,11.42,3.72 +NZL,2007,5.9,8.76,12.31,8.76 +NZL,2008,6.2,9.62,12.44,9.62 +NZL,2009,6.1,9.76,14.64,9.76 +NZL,2010,6.1,10.57,15.63,10.57 +NZL,2011,6.2,11.18,15.51,11.18 +NZL,2012,6.0,11.12,15.43,11.12 +NZL,2013,5.6,11.26,16.66,11.26 +NZL,2015,5.4,13.3,17.88,13.3 +NZL,2016,5.1,13.89,17.96,13.89 +NZL,2017,5.0,13.64,16.79,13.64 +POL,2005,7.9,2.02,7.94,2.02 +POL,2006,7.6,1.94,9.23,1.94 +POL,2007,7.4,2.7,9.65,2.7 +POL,2008,7.5,2.94,10.86,2.94 +POL,2009,7.4,3.7,12.4,3.7 +POL,2010,7.3,4.71,14.38,4.71 +POL,2011,7.1,4.83,13.61,4.83 +POL,2012,6.8,5.49,15.4,5.49 +POL,2013,6.7,6.78,17.09,6.78 +POL,2014,6.6,6.6,15.63,6.6 +POL,2015,6.9,7.63,17.16,7.63 +POL,2016,6.7,7.87,17.33,7.87 +POL,2017,6.6,7.93,16.88,7.93 +POL,2018,6.5,9.22,18.14,9.22 +PRT,2006,8.6,5.8,25.94,5.8 +PRT,2007,8.4,8.92,26.18,8.92 
+PRT,2008,8.3,9.28,27.56,9.28 +SVK,2003,7.4,2.05,9.12,2.05 +SVK,2004,7.3,3.72,10.24,3.72 +SVK,2005,7.3,4.28,11.35,4.28 +SVK,2006,7.2,4.47,12.28,4.47 +SVK,2007,7.0,5.77,13.77,5.77 +SVK,2008,6.9,6.13,13.76,6.13 +SVK,2009,6.7,6.13,13.37,6.13 +SVK,2010,6.6,6.86,14.1,6.86 +SVK,2011,6.3,7.04,15.0,7.04 +SVK,2012,6.2,6.29,15.53,6.29 +SVK,2013,6.2,6.65,15.33,6.65 +SVK,2014,7.0,8.3,17.35,8.3 +SVK,2015,6.9,8.85,17.88,8.85 +SVK,2016,6.8,9.02,17.31,9.02 +SVK,2017,6.8,9.56,17.28,9.56 +SVK,2018,6.7,9.55,18.36,9.55 +ESP,2010,6.4,11.98,15.95,11.98 +ESP,2011,6.2,13.76,16.64,13.76 +ESP,2012,6.1,14.77,17.19,14.77 +ESP,2013,6.1,15.34,17.59,15.34 +ESP,2014,6.0,15.51,17.6,15.51 +ESP,2015,6.1,15.85,18.02,15.85 +ESP,2016,6.0,16.09,18.31,16.09 +ESP,2017,6.0,16.38,18.65,16.38 +ESP,2018,6.0,17.2,19.12,17.2 +TUR,2002,5.8,0.88,4.89,0.88 +TUR,2003,5.7,1.48,5.63,1.48 +TUR,2004,5.6,2.2,6.6,2.2 +TUR,2005,5.3,2.91,7.44,2.91 +TUR,2006,5.1,4.47,8.56,4.47 +TUR,2007,4.4,5.84,9.62,5.84 +TUR,2008,4.1,7.91,10.68,7.91 +TUR,2009,4.1,8.68,11.63,8.68 +TUR,2010,4.0,9.27,12.36,9.27 +TUR,2011,3.9,9.55,13.12,9.55 +TUR,2012,3.9,9.58,13.53,9.58 +TUR,2013,3.9,9.86,13.89,9.86 +TUR,2014,4.0,9.81,13.88,9.81 +TUR,2015,3.9,10.15,14.31,10.15 +TUR,2016,4.0,10.55,14.53,10.55 +TUR,2017,4.1,11.01,14.77,11.01 +TUR,2018,4.1,11.24,14.88,11.24 +GBR,2001,7.7,6.21,6.88,6.21 +GBR,2002,7.5,4.99,7.29,4.99 +GBR,2003,7.3,4.54,6.91,4.54 +GBR,2004,7.1,5.0,7.02,5.0 +GBR,2005,6.9,5.4,7.45,5.4 +GBR,2006,6.6,5.62,7.53,5.62 +GBR,2008,6.3,5.5,7.26,5.5 +GBR,2010,6.1,6.55,7.92,6.55 +GBR,2011,6.0,6.96,8.48,6.96 +GBR,2012,6.0,7.16,9.09,7.16 +GBR,2013,6.0,7.2,9.3,7.2 +GBR,2014,6.0,7.23,9.46,7.23 +USA,1997,6.1,11.41,24.1,11.41 +USA,1999,5.9,13.19,25.09,13.19 +USA,2001,5.8,17.44,28.88,17.44 +USA,2003,5.7,19.32,29.26,19.32 +USA,2004,5.6,26.67,32.29,26.67 +USA,2006,5.6,26.58,34.02,26.58 +USA,2007,5.5,25.93,34.31,25.93 +USA,2012,5.4,34.46,43.89,34.46 +USA,2013,5.4,35.51,43.5,35.51 +USA,2014,5.5,38.12,41.05,38.12 +USA,2015,5.5,39.03,41.01,39.03 +USA,2016,5.5,36.74,41.88,36.74 +USA,2017,5.5,37.65,42.74,37.65 +EST,2005,6.0,2.21,7.38,2.21 +EST,2006,5.9,3.71,7.42,3.71 +EST,2007,5.9,5.22,11.19,5.22 +EST,2008,5.7,8.23,14.96,8.23 +EST,2009,5.6,7.49,14.99,7.49 +EST,2010,5.5,8.26,15.77,8.26 +EST,2011,5.5,9.79,16.57,9.79 +EST,2012,5.6,9.83,17.39,9.83 +EST,2013,6.0,11.38,18.97,11.38 +EST,2014,5.9,11.41,19.78,11.41 +EST,2015,6.0,12.16,16.72,12.16 +EST,2016,6.1,13.68,17.48,13.68 +EST,2017,6.1,13.66,18.22,13.66 +EST,2018,6.1,13.62,18.91,13.62 +ISR,2000,7.1,1.43,5.57,1.43 +ISR,2001,6.2,1.4,6.37,1.4 +ISR,2002,5.9,1.37,6.24,1.37 +ISR,2003,5.8,1.64,5.83,1.64 +ISR,2004,6.0,1.62,6.32,1.62 +ISR,2005,5.7,1.73,6.49,1.73 +ISR,2006,5.5,1.84,6.38,1.84 +ISR,2007,5.2,2.23,7.94,2.23 +ISR,2008,5.1,2.33,8.21,2.33 +ISR,2009,5.1,2.27,8.68,2.27 +ISR,2010,5.2,2.23,8.79,2.23 +ISR,2011,5.2,2.7,8.76,2.7 +ISR,2012,5.1,3.29,8.98,3.29 +ISR,2013,5.2,3.47,8.93,3.47 +ISR,2014,5.1,4.02,9.49,4.02 +ISR,2015,5.2,4.06,9.67,4.06 +ISR,2016,5.2,4.91,9.6,4.91 +ISR,2017,5.1,5.16,9.53,5.16 +ISR,2018,5.0,5.18,9.57,5.18 +RUS,1993,13.6,0.92,1.58,0.92 +RUS,1994,13.6,0.77,1.48,0.77 +RUS,1995,13.6,0.61,1.82,0.61 +RUS,1996,13.6,0.7,2.1,0.7 +RUS,1997,14.3,0.85,2.21,0.85 +RUS,1998,14.0,0.74,2.32,0.74 +RUS,1999,13.7,0.88,2.39,0.88 +RUS,2000,13.5,1.13,2.58,1.13 +RUS,2001,13.2,1.11,2.66,1.11 +RUS,2002,12.9,1.31,2.77,1.31 +RUS,2004,12.2,1.36,3.32,1.36 +RUS,2005,11.9,1.54,3.77,1.54 +RUS,2006,11.5,2.12,4.04,2.12 +RUS,2007,11.4,2.02,4.42,2.02 +RUS,2008,11.3,2.27,5.02,2.27 +RUS,2009,11.0,2.52,6.02,2.52 +RUS,2010,10.8,2.51,6.9,2.51 
+RUS,2011,11.3,2.62,7.72,2.62 +RUS,2012,10.8,4.17,9.09,4.17 +RUS,2013,10.3,3.99,11.28,3.99 +RUS,2014,9.9,4.44,12.2,4.44 +RUS,2015,9.7,4.64,12.56,4.64 +RUS,2016,9.4,4.52,12.76,4.52 +RUS,2017,9.3,4.6,13.0,4.6 +RUS,2018,9.1,4.84,13.37,4.84 +SVN,2006,5.8,6.48,10.46,6.48 +SVN,2008,5.7,6.43,12.37,6.43 +SVN,2009,5.6,7.35,11.77,7.35 +SVN,2010,5.5,7.81,12.69,7.81 +SVN,2011,6.8,8.77,12.67,8.77 +SVN,2012,6.9,8.75,12.64,8.75 +SVN,2013,6.6,9.22,12.14,9.22 +SVN,2014,6.6,9.21,13.09,9.21 +SVN,2015,6.5,9.21,13.08,9.21 +SVN,2016,6.5,11.14,14.04,11.14 +SVN,2017,6.6,11.61,15.0,11.61 +SVN,2018,6.7,12.05,15.91,12.05 +ISL,2007,5.6,19.26,32.1,19.26 +ISL,2008,5.5,18.9,31.5,18.9 +ISL,2009,5.5,21.98,34.54,21.98 +ISL,2010,5.4,22.01,37.73,22.01 +ISL,2011,5.3,21.94,40.75,21.94 +ISL,2012,5.5,21.83,40.53,21.83 +ISL,2013,5.6,21.62,40.15,21.62 +ISL,2014,5.8,21.38,39.71,21.38 +ISL,2015,5.9,21.16,39.3,21.16 +ISL,2016,5.9,20.87,38.76,20.87 +ISL,2017,5.7,20.38,43.68,20.38 +ISL,2018,5.6,19.85,48.2,19.85 +LVA,2003,7.9,1.31,13.55,1.31 +LVA,2004,7.8,0.88,15.02,0.88 +LVA,2005,7.4,2.68,18.31,2.68 +LVA,2006,7.2,2.7,18.48,2.7 +LVA,2007,7.1,5.0,21.81,5.0 +LVA,2008,7.1,6.89,23.88,6.89 +LVA,2009,6.1,7.47,25.68,7.47 +LVA,2010,6.2,8.1,29.08,8.1 +LVA,2011,6.0,9.22,31.07,9.22 +LVA,2012,5.8,9.83,32.44,9.83 +LVA,2013,5.8,10.43,34.78,10.43 +LVA,2014,5.9,12.54,36.11,12.54 +LVA,2015,6.0,12.64,36.91,12.64 +LVA,2016,5.9,13.78,36.23,13.78 +LVA,2017,6.0,13.9,39.13,13.9 +LVA,2018,6.0,13.49,38.4,13.49 +LTU,2000,9.2,0.29,6.57,0.29 +LTU,2001,9.0,0.86,7.2,0.86 +LTU,2002,8.7,0.87,8.71,0.87 +LTU,2003,8.3,0.88,9.08,0.88 +LTU,2004,8.2,1.18,11.55,1.18 +LTU,2005,8.1,1.5,12.04,1.5 +LTU,2006,8.0,3.06,12.84,3.06 +LTU,2007,7.7,3.4,10.52,3.4 +LTU,2008,7.5,4.38,13.76,4.38 +LTU,2009,7.2,5.37,16.12,5.37 +LTU,2010,7.1,4.84,18.73,4.84 +LTU,2011,7.0,5.94,20.14,5.94 +LTU,2012,6.9,10.04,23.76,10.04 +LTU,2013,6.9,10.48,23.67,10.48 +LTU,2014,6.8,10.57,22.17,10.57 +LTU,2015,6.6,11.02,21.0,11.02 +LTU,2016,6.6,12.2,23.01,12.2 +LTU,2017,6.5,12.37,23.33,12.37 +LTU,2018,6.5,12.49,24.27,12.49 diff --git a/python_linear_regression/python_linear_regression.md b/python_linear_regression/python_linear_regression.md new file mode 100644 index 000000000..aa18d8dc0 --- /dev/null +++ b/python_linear_regression/python_linear_regression.md @@ -0,0 +1,557 @@ + + +# Python Lesson on Regression for Machine Learning + +@overview + + +## Summary of Key Concepts in Linear Regression + +- **Definition**: Linear regression is a statistical method used to model and analyze the relationships between a dependent variable and one or more independent variables. + +- **Applications**: Commonly used in machine learning to predict continuous outcomes. + +- **Practical Application**: + - Applying linear regression to real-world datasets, such as synthetic healthcare investments and the diabetes dataset. + +- **Evaluation and Beyond**: + - Recognize model limitations and explore further analysis, such as: + - Nonlinear relationships. + - Model assumptions. + - Feature selection and engineering. + - Regularization techniques (Ridge, Lasso). + - Advanced models (Random Forests, Gradient Boosting). + + +- Linear regression is a starting point for data analysis and machine learning. +- The foundation built here prepares you for advanced techniques and complex challenges. +- Success in data analysis involves understanding data, asking the right questions, and critically evaluating results. 
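+
+To make the definition above concrete, here is a minimal, self-contained sketch of fitting a line with Scikit-learn. The numbers are invented purely for illustration and are not part of the datasets used later in this module.
+
+```python
+import numpy as np
+from sklearn.linear_model import LinearRegression
+
+# Toy data (illustrative values only): one predictor and one outcome
+X = np.array([[1], [2], [3], [4], [5], [6]])   # shape (n_samples, n_features)
+y = np.array([72, 70, 69, 66, 65, 63])
+
+# Fit a line of the form y = slope * x + intercept
+model = LinearRegression()
+model.fit(X, y)
+
+print("Slope (coefficient):", model.coef_[0])
+print("Intercept:", model.intercept_)
+print("Prediction for x = 7:", model.predict([[7]])[0])
+```
+
+The fitted slope tells you how much the outcome changes for a one-unit change in the predictor; interpreting coefficients this way is the same idea applied to the healthcare data in the walkthrough below.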
+
+
+
+
+## Python Implementation of Linear Regression
+
+To implement linear regression in Python using Scikit-learn, we can follow these steps:
+
+
+
+### 1. Import Libraries
+**Description:**
+This code block imports the necessary libraries for data manipulation and machine learning tasks. Specifically, it imports NumPy for numerical operations, Pandas for data manipulation, and scikit-learn (sklearn) for machine learning functionalities.
+
+**Why this is important:**
+Importing libraries is the first step in any data analysis or machine learning project. These libraries provide tools and functions to efficiently handle data, perform mathematical operations, and build machine learning models.
+
+* **numpy (np):** Provides tools for working with numerical arrays and mathematical operations.
+* **pandas (pd):** Enables data manipulation and analysis with data structures like DataFrames.
+* **sklearn:** A powerful machine learning library. We specifically use:
+    * `train_test_split`: Splits data into training (model building) and testing (model evaluation) sets.
+    * `StandardScaler`: Standardizes features to have zero mean and unit variance (often important for linear regression).
+    * `LinearRegression`: The core linear regression model.
+
+
+```python
+import numpy as np
+import pandas as pd
+
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.linear_model import LinearRegression
+```
+@Pyodide.eval
+
+
+**Output:**
+There's no output generated from this code block. It simply imports the required libraries for subsequent steps in the machine learning workflow.
+
+
+
+### 2. Load the Data
+
+**Description:**
+
+* `pd.read_csv("file")`: Reads data from a CSV file into a pandas DataFrame.
+* `data.info()`: Gives a summary of the data, such as column names, data types, and any missing values.
+
+**Why this is important:**
+Loading the data is the initial step in any data analysis or machine learning task. It's essential to understand the structure of the data, such as the number of features and their data types, before proceeding with further analysis.
+
+```python @Pyodide.exec
+
+import pandas as pd
+import io
+from pyodide.http import open_url
+
+# URL of the CSV file
+url = "https://raw.githubusercontent.com/arcus/education_modules/linear_regression/python_linear_regression/data/healthcare_investments_and_hospital_stay.csv"
+
+# Open and read the contents of the URL
+url_contents = open_url(url)
+text = url_contents.read()
+
+# Create a file-like object from the text content
+file = io.StringIO(text)
+
+# Read the CSV data into a pandas DataFrame
+data = pd.read_csv(file)
+
+# Analyze data and features
+data.info()
+```
+
+**Output:**
+After executing this code block, you will see a summary of the loaded data, including information about columns, data types, and non-null values. This helps you understand the dataset you will be working with.
+
+
+
+
+
+### 3. The `onehot_encode` Function
+
+**Description:**
+
+* This function handles categorical features (like `Location` in your data) by creating new columns where each column represents a unique category. The values are 1 if the data point belongs to that category and 0 otherwise.
+
+**Why this is important:**
+One-hot encoding is crucial when dealing with categorical data in machine learning models. Many machine learning algorithms cannot directly handle categorical data, so encoding these categories as numerical values allows algorithms to operate on the data effectively.
By creating binary columns for each category, we ensure that each category is treated equally, without imposing any ordinality or magnitude among them.
+
+
+```python
+def onehot_encode(df, column):
+    # Make a copy of the DataFrame to avoid modifying the original data
+    df = df.copy()
+
+    # Use pandas get_dummies function to one-hot encode the specified column
+    dummies = pd.get_dummies(df[column])
+
+    # Concatenate the one-hot encoded columns with the original DataFrame
+    df = pd.concat([df, dummies], axis=1)
+
+    # Drop the original categorical column since it's no longer needed
+    df = df.drop(column, axis=1)
+
+    return df
+```
+@Pyodide.eval
+
+
+
+
+
+### 4. Make Data Copy and One-Hot Encode
+
+**Description:**
+The code creates a copy of the loaded DataFrame `data` (called `df`) so that the original data remains unchanged. It then applies one-hot encoding to the `Location` column using the `onehot_encode` function.
+
+**Why this is important:**
+Creating a copy of the DataFrame is essential to prevent unintentional modifications to the original data, which could lead to unexpected results or loss of information. One-hot encoding is necessary to convert categorical variables, such as the `Location` column, into numerical format, which is required for many machine learning algorithms to operate effectively.
+
+* Creates a copy so we don't change the original data by accident.
+* Applies one-hot encoding to the `Location` column.
+
+```python
+# Make a copy of the loaded DataFrame to avoid modifying the original data accidentally
+df = data.copy()
+
+# Apply one-hot encoding to the 'Location' column
+df = onehot_encode(df, column='Location')
+
+# Print the resulting DataFrame to observe the effect of one-hot encoding
+print(df.head())
+```
+**Output:**
+The `print(df.head())` call displays the first few rows of the encoded DataFrame, in which the `Location` column has been replaced by one binary column per country code.
+
+
+
+
+### 5. Separate Target and Features
+**Description:**
+
+* This code snippet separates the target variable (`Hospital_Stay`) from the features in the DataFrame `df`.
+* The target variable (`y`) is what we want to predict, while the features (`X`) are the information we'll use to make the prediction.
+
+**Why this is important:**
+
+* Separating the target variable from the features is a crucial step in machine learning model training.
+* The target variable is the variable we aim to predict, while the features are the input variables that influence the prediction.
+* By separating them, we ensure that the model trains on the features to predict the target accurately.
+
+
+```python
+# Separate the target variable 'Hospital_Stay' from the features
+y = df['Hospital_Stay'].copy()
+X = df.drop('Hospital_Stay', axis=1).copy()
+
+# Print the target variable and features to verify the separation
+print("Target variable (y):")
+print(y.head())
+print("\nFeatures (X):")
+print(X.head())
+```
+
+**Output:**
+The print statements display the first few values of the target variable `y` and the first few rows of the feature matrix `X`, so you can verify the separation.
+
+
+
+### 6. Split into Training and Testing Sets
+**Description:**
+The `train_test_split` function divides the dataset into training and testing sets. Here, 70% of the data is used for training `(X_train, y_train)`, and the remaining 30% is held back for testing `(X_test, y_test)`.
+ +**Why this is important:** +Splitting the data into training and testing sets is crucial in machine learning to assess the performance of the model. The training set is used to train the model, while the testing set is used to evaluate its performance on unseen data. This helps to detect overfitting and ensures that the model generalizes well to new data. + +* `random_state=123` ensures we get the same split each time for reproducibility. + + +```python +# Split the data into training and testing sets +X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123) + +# Print the shapes of the resulting training and testing sets +print("Training set - X shape:", X_train.shape, "y shape:", y_train.shape) +print("Testing set - X shape:", X_test.shape, "y shape:", y_test.shape) + +``` +**Output:** +While this code block doesn't produce any output directly, we can demonstrate its usage by applying it to our data and printing the shapes of the resulting training and testing sets to confirm the split. + + +### 7. Standardize Features +**Description:** + +* The code initializes a `StandardScaler` object, which will be used to standardize (or z-score normalize) the features. +* It then fits the scaler to the training data (`X_train`), calculating the mean and standard deviation of each feature in the training set. +* Finally, it scales both the training and testing data to have zero mean and unit variance using the fitted scaler. This ensures that both datasets are scaled in the same way. + +**Why this is important:** +Standardizing features is crucial, especially when working with algorithms that rely on distance metrics or gradient descent optimization, such as KNN, SVM, or logistic regression. By standardizing the features, we remove the mean and scale the data to unit variance, which can improve the convergence rate of optimization algorithms and prevent features with larger scales from dominating those with smaller scales. + +```python +# Initialize a StandardScaler object +scaler = StandardScaler() + +# Fit the scaler to the training data, calculating the mean and standard deviation of each feature +scaler.fit(X_train) + +# Scale both training and testing data to have zero mean and unit variance +X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns) +X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns) + +# Print the scaled training and testing data to observe the effect of standardization +print("Scaled Training Data:") +print(X_train.head()) +print("\nScaled Testing Data:") +print(X_test.head()) +``` + +**Output:** +While this code block doesn't produce any output directly, learners can observe the effect of standardization by printing the scaled `X_train` and `X_test` datasets after applying the scaler. + + + +### 8. Create and Train the Model +**Description:** + +* This code segment creates a linear regression object using the `LinearRegression` class from the scikit-learn library. +* It then fits the model to the training data, finding the best-fit line (or plane, in higher dimensions) by minimizing the difference between predicted and actual values in the training data. + +**Why this is important:** +Creating and training a model is the core of supervised machine learning. In this step, we instantiate a regression model and train it on our training data to learn the underlying patterns and relationships between the input features (`X`) and the target variable (`y`). 
This trained model will later be used to make predictions on new, unseen data. + +```python +# Create a Linear Regression model object +model = LinearRegression() + +# Fit the model to the training data +model.fit(X_train, y_train) +``` +@Pyodide.eval + + + +### 9. Make Predictions +**Description:** +This line applies the trained machine learning model (`model`) to the testing data (`X_test`) to make predictions about hospital stay durations. + +**Why this is important:** +Making predictions is the ultimate goal of any machine learning model. By applying the trained model to new, unseen data, we can obtain predictions that can be used for decision-making or further analysis. + +```python +# Make predictions using the trained model and the testing data +y_pred = model.predict(X_test) + +# Print the predicted hospital stay durations +print(y_pred) +``` +@Pyodide.eval + +**Output:** +While this line doesn't produce output directly, we can add a print statement to display the predictions generated by the model. + + + +### 10. Evaluate the Model +**Description:** + +The code calculates and prints two evaluation metrics for the regression model: + +* Mean Squared Error (MSE): A measure of how close the predictions are to the actual values on average. Lower values indicate better performance. +* R² Score: Indicates the proportion of variance in the target variable that is explained by the model. Ranges from 0 to 1, with 1 being the best possible score. + +**Why this is important:** +Evaluating the model's performance is crucial to understand how well it is generalizing to unseen data. The Mean Squared Error provides a quantitative measure of the model's prediction accuracy, while the R² Score gives insight into the goodness of fit of the model. + + +```python +mse = np.mean((y_pred - y_test)**2) +print("MSE:", mse) +print(" R^2 Score: {:.5f}".format(model.score(X_test, y_test))) +``` + +**Output:** +This code snippet produces output showing the calculated MSE and R² Score, providing insights into the model's performance. + + +### Code Overview and Tips +This is a basic example of how to implement linear regression in Python using Scikit-learn. There are many other ways to implement linear regression in Python, but this is a good starting point. + +Here are some additional tips for implementing linear regression in Python: + +- Make sure to scale the data before training the model. This will help to ensure that all features have equal importance in the model. +- Use a validation set to evaluate the model and tune the hyperparameters. This will help to prevent overfitting. +- Use regularization techniques, such as L1 or L2 regularization, to prevent overfitting. +- Interpret the coefficients of the linear regression model to understand the relationship between the predictor variables and the target variable. + + + + +## Review your knowledge + +Which function from Scikit-learn is used to split the dataset into training and testing sets? + + +A) data_splitter +B) train_test_split +C) train_validate_split +D) model_splitter + + +[( )] `data_splitter` +[(X)] `train_test_split` +[( )] `train_validate_split` +[( )] `model_splitter` +*** +
+ +The `train_test_split` function from Scikit-learn is used to split the dataset into training and testing sets. This function is essential for evaluating the performance of a machine learning model by training it on one subset of the data and testing it on another. + + +
+ + + + + + + + +## Conclusion + +### Key Takeaways +By the end of this module, you'll have gained a solid grasp of linear regression as it is used in machine learning. You've learned how to implement and evaluate linear regression models using popular libraries like Scikit-learn. You've also seen how to apply these techniques to real-world datasets, both synthetic (healthcare investments) and established (diabetes dataset). + +While the linear regression model for the diabetes dataset explains a reasonable amount of variance (51.8%), it's important to remember that real-world data analysis rarely ends with a single model. + +### Beyond Linear Regression +**Further Analysis Needed:** + +* **Explore potential nonlinear relationships:** The relationship between diabetes progression and the predictor variables might not be strictly linear. +* **Evaluate model assumptions:** Linear regression assumes specific relationships between variables (e.g., linearity, independence, homoscedasticity) that may not hold in the data. +* **Feature selection and engineering:** Some predictors might be more important than others. Feature engineering techniques could create new, more informative features. +* **Regularization:** Techniques like Ridge or Lasso regression could help prevent overfitting and improve model generalization. +* **Advanced models:** Non-linear regression or machine learning models like Random Forests or Gradient Boosting might offer better predictive performance. + +This module is just the beginning of your journey into data analysis and machine learning. With the foundation you've built here, you're well-prepared to explore more advanced techniques and tackle complex data-driven challenges. + +Remember, the key to successful data analysis is not just about applying algorithms, but also about understanding your data, asking the right questions, and critically evaluating your results. As you continue learning, keep exploring, experimenting, and refining your skills to become a proficient data scientist. + + +## Additional Resources + +### Full Code Implementation + +At the end of this module, here you will find a "Full Code" section where all the code is consolidated into a single cell block. This allows for easy copying and pasting for those who want to implement the entire process quickly. While this single block of code isn't designed as a step-by-step educational tool, it serves as a convenient reference for future use and helps streamline the process for those already familiar with the concepts. 
Below is the complete code implementation:
+
+
+```python
+import numpy as np
+import pandas as pd
+
+# io and open_url are needed to fetch the CSV over HTTP in the Pyodide environment
+import io
+from pyodide.http import open_url
+
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.linear_model import LinearRegression
+
+def onehot_encode(df, column):
+    # Make a copy of the DataFrame to avoid modifying the original data
+    df = df.copy()
+
+    # Use pandas get_dummies function to one-hot encode the specified column
+    dummies = pd.get_dummies(df[column])
+
+    # Concatenate the one-hot encoded columns with the original DataFrame
+    df = pd.concat([df, dummies], axis=1)
+
+    # Drop the original categorical column since it's no longer needed
+    df = df.drop(column, axis=1)
+
+    return df
+
+# URL of the CSV file
+url = "https://raw.githubusercontent.com/arcus/education_modules/linear_regression/python_linear_regression/data/healthcare_investments_and_hospital_stay.csv"
+
+# Open and read the contents of the URL
+url_contents = open_url(url)
+text = url_contents.read()
+
+# Create a file-like object from the text content
+file = io.StringIO(text)
+
+# Read the CSV data into a pandas DataFrame
+data = pd.read_csv(file)
+
+# Analyze data and features
+data.info()
+
+# Make a copy of the DataFrame to avoid modifying the original data accidentally
+df = data.copy()
+
+# Apply one-hot encoding to the 'Location' column
+df = onehot_encode(df, column='Location')
+
+# Print the resulting DataFrame to observe the effect of one-hot encoding
+print(df.head())
+
+# Separate the target variable 'Hospital_Stay' from the features
+y = df['Hospital_Stay'].copy()
+X = df.drop('Hospital_Stay', axis=1).copy()
+
+# Print the target variable and features to verify the separation
+print("Target variable (y):")
+print(y.head())
+print("\nFeatures (X):")
+print(X.head())
+
+# Split the data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)
+
+# Print the shapes of the resulting training and testing sets
+print("Training set - X shape:", X_train.shape, "y shape:", y_train.shape)
+print("Testing set - X shape:", X_test.shape, "y shape:", y_test.shape)
+
+# Initialize a StandardScaler object
+scaler = StandardScaler()
+
+# Fit the scaler to the training data, calculating the mean and standard deviation of each feature
+scaler.fit(X_train)
+
+# Scale both training and testing data to have zero mean and unit variance
+X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
+X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
+
+# Print the scaled training and testing data to observe the effect of standardization
+print("Scaled Training Data:")
+print(X_train.head())
+print("\nScaled Testing Data:")
+print(X_test.head())
+
+# Create a Linear Regression model object
+model = LinearRegression()
+
+# Fit the model to the training data
+model.fit(X_train, y_train)
+
+# Make predictions using the trained model and the testing data
+y_pred = model.predict(X_test)
+
+# Print the predicted hospital stay durations
+print(y_pred)
+
+# Calculate Mean Squared Error (MSE)
+mse = np.mean((y_pred - y_test)**2)
+
+# Print MSE
+print("MSE:", mse)
+
+# Calculate and print R² Score
+print("R² Score:", model.score(X_test, y_test))
+```
+
+## Feedback
+
+@feedback
diff --git a/python_linear_regression/python_linear_regression_exercise.ipynb b/python_linear_regression/python_linear_regression_exercise.ipynb
new file mode 100644
index 000000000..e1c6e9407
--- /dev/null
+++ 
b/python_linear_regression/python_linear_regression_exercise.ipynb @@ -0,0 +1,206 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "qJomtu5Ddh1h" + }, + "source": [ + "# Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XO4ZsjHac4MD" + }, + "source": [ + "**Real World Code Example: Diabetes Progression Prediction**\n", + "\n", + "\n", + "This notebook demonstrates a basic linear regression analysis on a diabetes dataset to predict disease progression. The dataset includes information on 442 patients, their medical attributes, and a quantitative measure of disease advancement after one year." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vnNsDB28djkH" + }, + "source": [ + "# Data Description" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gyZ61b75c7iQ" + }, + "source": [ + "The data includes:\n", + "\n", + "* **Predictor Variables:**\n", + " * Age (years)\n", + " * Sex\n", + " * Body Mass Index (BMI)\n", + " * Average Blood Pressure\n", + " * Six Blood Serum Measurements (normalized)\n", + "* **Target Variable:**\n", + " * Quantitative measure of disease progression one year after baseline\n", + "\n", + "\n", + "Each feature variable has been mean-centered and scaled by the standard deviation times the square root of the number of samples.\n", + "\n", + "**Citation:**\n", + "\n", + "This dataset is sourced from the research paper \"Least Angle Regression\" by Bradley Efron, Trevor Hastie, Iain Johnstone, and Robert Tibshirani (Annals of Statistics, 2004)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P-ZP5ZLWdTaE" + }, + "source": [ + "# Install and Import:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HKf3oUgwc7I1" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from sklearn import datasets\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import mean_squared_error, r2_score" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cY3m-DOddRFc" + }, + "source": [ + "# Load and Explore Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JP4lt_lvdNwH" + }, + "outputs": [], + "source": [ + "# Load the diabetes dataset\n", + "diabetes = datasets.load_diabetes()\n", + "\n", + "# Print dataset description\n", + "print(diabetes.DESCR)\n", + "\n", + "# Separate features (X) and target variable (Y)\n", + "X = diabetes.data\n", + "Y = diabetes.target" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "r2Nyyk_FdYdg" + }, + "source": [ + "# Build and Train Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qgNlTeu1dPKg" + }, + "outputs": [], + "source": [ + "# Create Linear Regression model\n", + "model = LinearRegression()\n", + "\n", + "# Train the model on the data\n", + "model.fit(X, Y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TZUOu5k6db7T" + }, + "source": [ + "# Predict and Evaluate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LE-F0aOpdeaS" + }, + "outputs": [], + "source": [ + "# Make predictions\n", + "predictions = model.predict(X)\n", + "\n", + "# Model Coefficients and Intercept\n", + "print('Coefficients:', model.coef_)\n", + "print('Intercept:', model.intercept_)\n", + "\n", + "# Evaluate performance using R-squared and Mean Squared Error\n", + 
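"# Note: these scores are computed on the same data used to fit the model (in-sample); see the cross-validation suggestion below\n",
+        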
"print('R-squared:', r2_score(Y, predictions))\n", + "print('Mean Squared Error:', mean_squared_error(Y, predictions))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BmBkosEIdfzt" + }, + "source": [ + "# Interpretation and Next Steps" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "41PlZvIgdpRR" + }, + "source": [ + "This basic linear regression model explains approximately 51.8% of the variance in disease progression. However, the mean squared error indicates room for improvement.\n", + "\n", + "\n", + "**Future Directions:**\n", + "\n", + "* **Explore non-linear relationships:** Consider non-linear models (e.g., polynomial regression).\n", + "* **Feature selection/engineering:** Identify the most relevant predictors.\n", + "* **Regularization:** Prevent overfitting by adding penalty terms to the model.\n", + "* **Cross-validation:** Assess the model's performance on unseen data.\n", + "* **Advanced techniques:** Explore machine learning algorithms like Random Forests or Gradient Boosting." + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}