diff --git a/baseball/hit-location-diagram.ipynb b/baseball/hit-location-diagram.ipynb index 3f704ef..d99bd50 100644 --- a/baseball/hit-location-diagram.ipynb +++ b/baseball/hit-location-diagram.ipynb @@ -1,15 +1,40 @@ { "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Baseball hit diagram\n", + "This example notebook uses pandas and matplotlib to break down the most common locations for field outs.\n", + "\n", + "// Need to add more details about how arrays are manipulated to create sections.\n", + "\n", + "// Should we delete the results of python cells so new users don't know the result?" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import needed modules" + ] + }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "ok\n" + "ename": "ModuleNotFoundError", + "evalue": "No module named 'pandas'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[1], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mnumpy\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mnp\u001b[39;00m \n\u001b[1;32m----> 2\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpandas\u001b[39;00m \n\u001b[0;32m 3\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mos\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mre\u001b[39;00m\n", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'pandas'" ] } ], @@ -26,6 +51,14 @@ "print('ok')" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Back up MLB season from bigdata/baseball" + ] + }, { "cell_type": "code", "execution_count": 2, @@ -53,6 +86,14 @@ "print('ok')" ] }, + {
+ "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Break down the results of each play" + ] + }, { "cell_type": "code", "execution_count": 3, @@ -133,36 +174,13 @@ ] }, { - "cell_type": "code", - "execution_count": 4, + "attachments": {}, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# Total events in 2022: 214,038\n", - "# \n", - "# event_type\n", - "# Field out 82804\n", - "# Strikeout 40812\n", - "# Field hit 34460\n", - "# No play 26673\n", - "# Walk 16973\n", - "# Non-batter event 5278\n", - "# Home run 5215\n", - "# Error 1118\n", - "# Fielders choice 705\n", - "# dtype: int64\n", - "\n", - "\n", - "# batter_hand (overall)\n", - "# R 128878\n", - "# L 85160\n", - "# dtype: int64\n", - "\n", - "\n", - "# batter_hand (on fielder outs)\n", - "# R 49731\n", - "# L 33073\n", - "# dtype: int64" + "# Filter out \"\" plays\n", + "\n", + "TODO(KoT17): Breakdown of what each acronym means" ] }, { @@ -226,93 +244,69 @@ "print('loading out info for field outs...')\n", "events_df.loc[events_df['event_type'] == 'Field out', 'out_info'] = events_df[events_df['event_type'] == 'Field out']['theplay'].apply(lambda x: '/'.join(str(x).split('/')[1:]))\n", "\n", - "\n", - "# print(events_df.loc[14890286]['out_info'])\n", - "# print(events_df.loc[12601834]['out_info'])\n", - "\n", "dp_re = re.compile('(\\/)*(L|G|N)*(D|T)+(P)+(\\/)*')\n", "ix = events_df[\n", " (events_df['event_type'] == 'Field out') \n", " & (events_df['out_info'].apply(lambda x: re.search(dp_re, str(x)) != None))\n", "].index\n", - "# yix = events_df[events_df.index.isin(ix)].sample(10).index\n", - "# print(events_df[events_df.index.isin(yix)]['out_info'])\n", "\n", "print('\\n')\n", "print('removing (N)DP/TP (n = {0:,.0f})...'.format(len(ix)))\n", "events_df.loc[ix, 'edited'] = True\n", "events_df.loc[ix, 'out_info'] = events_df[events_df.index.isin(ix)\n", - " ]['out_info'].apply(lambda x: re.sub(dp_re, '', str(x)))\n", -
"#print('----------------------------------------------')\n", - "#print(events_df[events_df.index.isin(yix)]['out_info'])\n", "\n", "int_re = re.compile('(\\/)*(R|B)*(INT)+(\\/)*')\n", "ix = events_df[\n", " (events_df['event_type'] == 'Field out') \n", " & (events_df['out_info'].apply(lambda x: re.search(int_re, str(x)) != None))\n", "].index\n", - "print('removing INT (n = {0:,.0f})...'.format(len(ix)))\n", - "# yix = events_df[events_df.index.isin(ix)].sample(10).index\n", - "#print(events_df[events_df.index.isin(yix)]['out_info'])\n", "\n", + "print('removing INT (n = {0:,.0f})...'.format(len(ix)))\n", "events_df.loc[ix, 'edited'] = True\n", "events_df.loc[ix, 'out_info'] = events_df[events_df.index.isin(ix)\n", - " ]['out_info'].apply(lambda x: re.sub(int_re, '', str(x)))# print('----------------------------------------------')\n", - "#print(events_df[events_df.index.isin(yix)]['out_info'])\n", + " ]['out_info'].apply(lambda x: re.sub(int_re, '', str(x)))\n", "\n", "fo_re = re.compile('(\\/)*(FO)+(\\/)*')\n", "ix = events_df[\n", " (events_df['event_type'] == 'Field out') \n", " & (events_df['out_info'].apply(lambda x: re.search(fo_re, str(x)) != None))\n", "].index\n", - "print('removing FO (n = {0:,.0f})...'.format(len(ix)))\n", - "yix = events_df[events_df.index.isin(ix)].sample(10).index\n", - "#print(events_df[events_df.index.isin(yix)]['out_info'])\n", "\n", + "print('removing FO (n = {0:,.0f})...'.format(len(ix)))\n", "events_df.loc[ix, 'edited'] = True\n", "events_df.loc[ix, 'out_info'] = events_df[events_df.index.isin(ix)\n", - " ]['out_info'].apply(lambda x: re.sub(fo_re, '', str(x)))# print('----------------------------------------------')\n", - "#print(events_df[events_df.index.isin(yix)]['out_info'])\n", + " ]['out_info'].apply(lambda x: re.sub(fo_re, '', str(x)))\n", "\n", "if_sf_re = re.compile('(\\/)*(IF|SF|SH)+(\\/)*')\n", "ix = events_df[(events_df['event_type'] == 'Field out') & (events_df['out_info'].apply(lambda x: re.search(if_sf_re, 
str(x)) != None))].index\n", + "\n", "print('removing IF/SF/SH (n = {0:,.0f})...'.format(len(ix)))\n", - "yix = events_df[events_df.index.isin(ix)].sample(10).index\n", - "#print(events_df[events_df.index.isin(yix)]['out_info'])\n", "events_df.loc[ix, 'edited'] = True\n", "events_df.loc[ix, 'out_info'] = events_df[events_df.index.isin(ix)\n", - " ]['out_info'].apply(lambda x: re.sub(if_sf_re, '', str(x)))# print('----------------------------------------------')\n", - "#print(events_df[events_df.index.isin(yix)]['out_info'])\n", + " ]['out_info'].apply(lambda x: re.sub(if_sf_re, '', str(x)))\n", "\n", "fl_re = re.compile('(\\/)*(FL)+(\\/)*')\n", "ix = events_df[\n", " (events_df['event_type'] == 'Field out') \n", " & (events_df['out_info'].apply(lambda x: re.search(fl_re, str(x)) != None))\n", "].index\n", - "yix = events_df[events_df.index.isin(ix)].sample(10).index\n", - "#print('\\n')\n", + "\n", "print('removing FL (n = {0:,.0f})...'.format(len(ix)))\n", - "#print(events_df[events_df.index.isin(yix)]['out_info'])\n", "events_df.loc[ix, 'edited'] = True\n", "events_df.loc[ix, 'out_info'] = events_df[events_df.index.isin(ix)\n", - " ]['out_info'].apply(lambda x: re.sub(fl_re, '', str(x)))# print('----------------------------------------------')\n", - "# #print('-------------------------------')\n", - "#print(events_df[events_df.index.isin(yix)]['out_info'])\n", + " ]['out_info'].apply(lambda x: re.sub(fl_re, '', str(x)))\n", "\n", "th_re = re.compile('(\\/)*(TH)+(\\/)*')\n", "ix = events_df[\n", " (events_df['event_type'] == 'Field out') \n", " & (events_df['out_info'].apply(lambda x: re.search(th_re, str(x)) != None))\n", "].index\n", - "#yix = events_df[events_df.index.isin(ix)].index\n", - "#print('\\n')\n", + "\n", "print('removing TH (n = {0:,.0f})...'.format(len(ix)))\n", - "#print(events_df[events_df.index.isin(yix)]['out_info'])\n", "events_df.loc[ix, 'edited'] = True\n", "events_df.loc[ix, 'out_info'] = events_df[events_df.index.isin(ix)\n", - " 
]['out_info'].apply(lambda x: re.sub(th_re, '', str(x)))# print('----------------------------------------------')\n", - "# #print('-------------------------------')\n", - "#print(events_df[events_df.index.isin(yix)]['out_info'])\n", + " ]['out_info'].apply(lambda x: re.sub(th_re, '', str(x)))\n", "\n", "ix = events_df[events_df['theplay'] == '54/SH/25'].index\n", "print('removing some special cases (n = {0:,.0f})...'.format(len(ix)))\n", @@ -346,10 +340,15 @@ "\n", "e = time.time()\n", "print('\\n')\n", - "print('Done in {0:.0f} seconds!'.format(e-s))\n", - "\n", - "\n", - "events_df[(events_df['event_type'] == 'Field out') & (events_df['edited'] == False)]['out_info']\n" + "print('Done in {0:.0f} seconds!'.format(e-s))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Determine the out type and location" ] }, { @@ -390,9 +389,15 @@ "\n", "e = time.time()\n", "print('\\n')\n", - "print('Done in {0:.0f} seconds!'.format(e-s))\n", - "\n", - "#events_df[(events_df['event_type'] == 'Field out') & (events_df['out_info'].apply(lambda x: len(str(x)) > 1))]['out_info'].apply(lambda x: )" + "print('Done in {0:.0f} seconds!'.format(e-s))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Populate plot with number of outs" ] }, { @@ -523,8 +528,8 @@ " if (not(np.isnan(rightdata[i,j]))):\n", " rightsumtotal += rightdata[i,j]\n", " \n", - "print('\\tn = {0:,.0f}'.format(rightsumtotal))\n", - "print('\\t\\tChecksum = {0:,.0f}'.format(rdf.to_numpy().sum()))\n", + "print('n = {0:,.0f}'.format(rightsumtotal))\n", + "print('Checksum = {0:,.0f}'.format(rdf.to_numpy().sum()))\n", "\n", "\n", "print('\\n')\n", @@ -597,123 +602,17 @@ "rightdata_bk = rightdata\n", "leftdata_bk = leftdata\n", "\n", - "print('\\tn = {0:,.0f}'.format(leftsumtotal))\n", - "print('\\t\\tChecksum = {0:,.0f}'.format(ldf.to_numpy().sum()))\n", + "print('n = {0:,.0f}'.format(leftsumtotal))\n", + "print('Checksum =
{0:,.0f}'.format(ldf.to_numpy().sum()))\n", "\n" ] }, { - "cell_type": "code", - "execution_count": 9, + "attachments": {}, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ok\n" - ] - } - ], "source": [ - "def heatmap(data, ax=None,\n", - " cbar_kw=None, cbarlabel=\"\", **kwargs):\n", - " \"\"\"\n", - " Create a heatmap from a numpy array and two lists of labels.\n", - "\n", - " Parameters\n", - " ----------\n", - " data\n", - " A 2D numpy array of shape (M, N).\n", - " row_labels\n", - " A list or array of length M with the labels for the rows.\n", - " col_labels\n", - " A list or array of length N with the labels for the columns.\n", - " ax\n", - " A `matplotlib.axes.Axes` instance to which the heatmap is plotted. If\n", - " not provided, use current axes or create a new one. Optional.\n", - " cbar_kw\n", - " A dictionary with arguments to `matplotlib.Figure.colorbar`. Optional.\n", - " cbarlabel\n", - " The label for the colorbar. Optional.\n", - " **kwargs\n", - " All other arguments are forwarded to `imshow`.\n", - " \"\"\"\n", - "\n", - " if ax is None:\n", - " ax = plt.gca()\n", - "\n", - " if cbar_kw is None:\n", - " cbar_kw = {}\n", - "\n", - " # Plot the heatmap\n", - " im = ax.imshow(data, **kwargs)\n", - "\n", - " # Create colorbar\n", - " cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw)\n", - " cbar.ax.set_xlabel(cbarlabel)\n", - "\n", - " return im, cbar\n", - "\n", - "\n", - "def annotate_heatmap(im, data=None, valfmt=\"{x:.2f}\",\n", - " textcolors=(\"black\", \"white\"),\n", - " threshold=None, **textkw):\n", - " \"\"\"\n", - " A function to annotate a heatmap.\n", - "\n", - " Parameters\n", - " ----------\n", - " im\n", - " The AxesImage to be labeled.\n", - " data\n", - " Data used to annotate. If None, the image's data is used. Optional.\n", - " valfmt\n", - " The format of the annotations inside the heatmap. This should either\n", - " use the string format method, e.g. 
\"$ {x:.2f}\", or be a\n", - " `matplotlib.ticker.Formatter`. Optional.\n", - " textcolors\n", - " A pair of colors. The first is used for values below a threshold,\n", - " the second for those above. Optional.\n", - " threshold\n", - " Value in data units according to which the colors from textcolors are\n", - " applied. If None (the default) uses the middle of the colormap as\n", - " separation. Optional.\n", - " **kwargs\n", - " All other arguments are forwarded to each call to `text` used to create\n", - " the text labels.\n", - " \"\"\"\n", - "\n", - " if not isinstance(data, (list, np.ndarray)):\n", - " data = im.get_array()\n", - "\n", - " # Normalize the threshold to the images color range.\n", - " if threshold is not None:\n", - " threshold = im.norm(threshold)\n", - " else:\n", - " threshold = im.norm(data.max())/2.\n", - "\n", - " # Set default alignment to center, but allow it to be\n", - " # overwritten by textkw.\n", - " kw = dict(horizontalalignment=\"center\",\n", - " verticalalignment=\"center\")\n", - " kw.update(textkw)\n", - "\n", - " # Get the formatter in case a string is supplied\n", - " if isinstance(valfmt, str):\n", - " valfmt = mpl.ticker.StrMethodFormatter(valfmt)\n", - "\n", - " # Loop over the data and create a `Text` for each \"pixel\".\n", - " # Change the text's color depending on the data.\n", - " texts = []\n", - " for i in range(data.shape[0]):\n", - " for j in range(data.shape[1]):\n", - " kw.update(color=textcolors[int(im.norm(data[i, j]) > threshold)])\n", - " text = im.axes.text(j, i, valfmt(data[i, j], None), **kw)\n", - " texts.append(text)\n", - "\n", - " return texts\n", - "print('ok')" + "# Plot out heatmap for left and right handed batters" ] }, { @@ -723,7 +622,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ "
" ] @@ -744,7 +643,7 @@ "oflabels = ['', '7LF', '7L', '7', '78', '8', '89', '9', '9L', '9LF']\n", "iflabels = ['5F', '5', '56', '6', '6M', '4M', '4', '34', '3', '3F']\n", "\n", - "im, _ = heatmap(leftdata, ax=ax,\n", + "im, _ = plt.heatmap(leftdata, ax=ax,\n", " cmap=cmapname, cbarlabel=\"Outs to sector\")#, cbar_kw={'location': 'bottom'})\n", "#annotate_heatmap(im, valfmt=\"{x:.0f}\", size=7, textcolors=thelabelcolors)\n", "\n", @@ -753,7 +652,7 @@ "#ax.spines[:].set_visible(False)\n", "ax.set_title('LEFT-handed batters')\n", "\n", - "im, _ = heatmap(rightdata, ax=ax2,\n", + "im, _ = plt.heatmap(rightdata, ax=ax2,\n", " cmap=cmapname, cbarlabel=\"Outs to sector\")#, cbar_kw={'location': 'bottom'})\n", "#annotate_heatmap(im, valfmt=\"{x:.0f}\", size=7, textcolors=thelabelcolors)\n", "\n", @@ -777,9 +676,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.8 (py38)", + "display_name": "Python 3", "language": "python", - "name": "py38" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -791,7 +690,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.9.0" + }, + "vscode": { + "interpreter": { + "hash": "a8712b7f88d470a3450a2747036c2653d8e7da53ae1b559f2a80ca921b7ec002" + } } }, "nbformat": 4,