wokhouse · wokhouse · Feb 26, 2023 · Feb 26, 2023
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -7,13 +7,13 @@
     "@testing-library/jest-dom": "^5.16.5",
     "@testing-library/react": "^13.4.0",
     "@testing-library/user-event": "^13.5.0",
+    "@tidyjs/tidy": "^2.5.2",
     "@types/jest": "^27.5.2",
     "@types/node": "^16.18.12",
     "@types/react": "^18.0.28",
     "@types/react-dom": "^18.0.11",
     "await-to-js": "^3.0.0",
     "bootstrap": "^5.3.0-alpha1",
-    "danfojs": "^1.1.2",
     "echarts-for-react": "^3.0.2",
     "react": "^18.2.0",
     "react-dom": "^18.2.0",

diff --git a/src/analysis/computeAccuracy.ts b/src/analysis/computeAccuracy.ts
@@ -1,15 +1,18 @@
-import { DataFrame } from "danfojs";
+import { tidy, filter, tally } from "@tidyjs/tidy";
+import { LongSessionRow } from "../types";
+
+const computeAccuracy = (session: LongSessionRow[]): number => {
+  const correct: number = tidy(
+    session,
+    filter(({ correct }) => correct === true),
+    tally()
+  )[0]["n"];
+  const incorrect: number = tidy(
+    session,
+    filter(({ correct, key }) => correct === false && key !== "Backspace"),
+    tally()
+  )[0]["n"];
 
-const computeAccuracy = (session: DataFrame): number => {
-  // get everything that ins't backspace
-  const mask = session["key"].values.map((k: string) => k !== "Backspace");
-  const correctMask = session.query(mask)["correct"].values;
-  const incorrectMask = session
-    .query(mask)
-    ["correct"].values.map((v: boolean) => v === false);
-  const correct = session.query(correctMask).values.length;
-  const incorrect = session.query(incorrectMask).values.length;
-  // console.log(correctMask, incorrectMask);
   return +(correct / (correct + incorrect)).toFixed(2);
 };
 

diff --git a/src/analysis/computeNGram.ts b/src/analysis/computeNGram.ts
@@ -1,31 +1,49 @@
-import { DataFrame } from "danfojs";
-import { nGramFeedback, WordPerformance } from "../types";
+import { tidy, filter, select } from "@tidyjs/tidy";
+import { createFinalOutput } from "../bin/createFinalOutput";
+import { pullColumns } from "../bin/pullColumns";
+import { LongSessionRow, nGramFeedback, WordPerformance } from "../types";
 
-const transformIntoWords = (input: DataFrame): WordPerformance[] => {
-  // create list of words w/ array of targetID's
+interface NGramTarget {
+  nGram: string;
+  startIndex: number;
+  endIndex: number;
+}
+
+const transformIntoWords = (input: LongSessionRow[]): WordPerformance[] => {
+  // create a map from each word to the targetIDs of that word's instances
   const targets: { [key: string]: number[] } = {};
-  input.values.forEach((rv: any, rowIndex: number) => {
-    const row = input.iloc({ rows: [rowIndex] });
-    const { target: targetValues, targetID: targetIDValues } = row;
-    const target = targetValues.values[0];
-    const targetID = targetIDValues.values[0];
-    if (targets[target] === undefined) targets[target] = [targetID];
+  // create separate array of words so we can easily iterate through
+  const targetWords: string[] = [];
+  input.forEach((row: LongSessionRow) => {
+    const { target, targetID } = row;
+    // if this is a new word, then create a new key/value pair in the targets object
+    if (targets[target] === undefined) {
+      targets[target] = [targetID];
+      targetWords.push(target);
+    }
     // only add targetID to array once
     else if (targets[target].indexOf(targetID) < 0)
       targets[target] = [...targets[target], targetID];
   });
 
   const words: WordPerformance[] = [];
-  Object.keys(targets).forEach((target) => {
+  // create array that contains each word and every attempt made to type the word
+  targetWords.forEach((target) => {
     const thisWordPerf: WordPerformance = { target, attempts: [] };
 
     targets[target].forEach((targetID) => {
-      const row = input.query(input["targetID"].eq(targetID));
-      const { correct, timestamp, key } = row;
+      const row = tidy(
+        input,
+        filter(({ targetID: searchID }) => searchID === targetID)
+      );
+      const [key, correct, timestamp] = pullColumns(
+        tidy(row, select(["key", "correct", "timestamp"])),
+        ["key", "correct", "timestamp"]
+      ) as [string[], boolean[], number[]];
       thisWordPerf.attempts.push({
-        key: key.values,
-        correct: correct.values,
-        timestamps: timestamp.values,
+        key,
+        correct,
+        timestamp,
       });
     });
     words.push(thisWordPerf);
@@ -34,29 +52,6 @@ const transformIntoWords = (input: DataFrame): WordPerformance[] => {
   return words;
 };
 
-interface FinalOutputItem {
-  key: string;
-  timestamp: number;
-}
-// we need to get the final output i.e. the key events minus the ones that were followed by backspace
-const createFinalOutput = (
-  keys: string[],
-  timestamp: number[]
-): FinalOutputItem[] => {
-  let out: FinalOutputItem[] = [];
-  keys.forEach((k, i) => {
-    if (k !== "Backspace") out.push({ key: k, timestamp: timestamp[i] });
-    else out = out.slice(0, out.length - 1);
-  });
-  return out;
-};
-
-interface NGramTarget {
-  nGram: string;
-  startIndex: number;
-  endIndex: number;
-}
-
 // get corresponding performance data for a given nGram
 const dumpNGrams = (
   nGramsInTarget: NGramTarget[],
@@ -97,18 +92,18 @@ const createNGramsFromWords = (
       }
     );
     const individualNGramPerformance = attempts.flatMap(
-      ({ key, correct, timestamps }): nGramFeedback[] => {
+      ({ key, correct, timestamp }): nGramFeedback[] => {
         // if ever key press was correct, then we can just dump that into the nGram object
         // if there was a miss, we need to figure out what we can work with
         // the simplest is when there's just a mistyped character but the string is still the same length as the target
         if (correct.indexOf(false) <= -1 || target.length === key.length) {
-          return dumpNGrams(nGramsInTarget, correct, timestamps);
+          return dumpNGrams(nGramsInTarget, correct, timestamp);
         }
         // if there's backspaces, then we need to create the full visual history so we can calculate the nGram duration
-        const finalTyped = createFinalOutput(key, timestamps);
+        const finalTyped = createFinalOutput(key, timestamp);
         // if the lengths are the same, then it can be treated like a simply mistype
         if (finalTyped.length === target.length) {
-          return dumpNGrams(nGramsInTarget, correct, timestamps);
+          return dumpNGrams(nGramsInTarget, correct, timestamp);
         }
         // if the lengths differ, then there's probably a way we could handle it,
         // but i don't want to implement it right now :-P
@@ -137,10 +132,11 @@ const createNGramsFromWords = (
 };
 
+// turn the session into per-word attempts, then compute nGram feedback for nGramSize
 const computeNGram = (
-  session: DataFrame,
+  session: LongSessionRow[],
   nGramSize: number
 ): nGramFeedback[] => {
   const words = transformIntoWords(session);
   const nGrams = createNGramsFromWords(words, nGramSize);
   return nGrams;
 };

diff --git a/src/analysis/computeRunningWPM.ts b/src/analysis/computeRunningWPM.ts
@@ -1,58 +1,54 @@
-import { DataFrame } from "danfojs";
+import {
+  tidy,
+  filter,
+  mutate,
+  groupBy,
+  summarize,
+  first,
+  last,
+} from "@tidyjs/tidy";
+import { LongSessionRow } from "../types";
 
-const computeRunningWPM = (session: DataFrame): number[] => {
+const computeRunningWPM = (session: LongSessionRow[]): number[] => {
   // filter out incorrect key presses
-  const correctEvents = session.query(
-    session["key"].values.map((k: string) => k !== "Backspace")
+  const correctEvents = tidy(
+    session,
+    filter(({ key }) => key !== "Backspace")
   );
   // if there are not enough correct events, then stop and return -1
-  if (correctEvents.values.length < 5) return [-1];
+  if (correctEvents.length < 5) return [-1];
   // create groups of 5 characters and measure how long each one took
-  const totalRows = correctEvents.shape[0];
-  // combine the last two groups so there's enough data
-  const remainder = totalRows % 5;
-  const wpmGroupLabelsWithoutRemainder = Array.from(Array(totalRows)).map(
-    (_, i) => Math.trunc(i / 5)
+  // to prevent having too small a last group, we'll only take n rows up to a number divisible by 5
+  const lastGroup = Math.trunc(correctEvents.length / 5);
+  let rowNum = 0;
+  const annotatedSession = tidy(
+    correctEvents,
+    mutate({
+      wpmGroup: () => {
+        const group = Math.trunc(rowNum / 5);
+        rowNum += 1;
+        return group;
+      },
+    }),
+    filter(({ wpmGroup }) => wpmGroup < lastGroup)
   );
-  const wpmGroupLabels = [
-    ...wpmGroupLabelsWithoutRemainder.slice(
-      0,
-      wpmGroupLabelsWithoutRemainder.length - remainder
-    ),
-    ...Array.from(Array(remainder)).map(() => Math.trunc(totalRows / 5) - 1),
-  ];
 
-  const annotatedSession = correctEvents.addColumn("wpmGroup", wpmGroupLabels);
-  const maxLabel = wpmGroupLabels.slice(-1)[0];
-  // iterate through row groups and get duration
-  const runningWPMValues = Array.from(new Array(maxLabel)).map(
-    (_, i): number => {
-      // grab the timestamp col for the 5 character group
-      const mask = annotatedSession["wpmGroup"].eq(i);
-      let queryRes;
-      try {
-        queryRes = annotatedSession.query(mask);
-      } catch (err) {
-        console.error("failed to query dataframe!", {
-          err,
-          targetedWPMGroup: i,
-          mask,
-          annotatedSession,
-        });
-        return -1;
-      }
-      const group = queryRes["timestamp"];
-      // compute group duration in seconds
-      const tStart = group.min({ axis: 1 });
-      const tEnd = group.max({ axis: 1 });
-      // this is the duration it took to type 5 characters
-      const duration = (tEnd - tStart) / 60000;
-      // so we can extrapolate that out to 1 minute to get the instantaneous WPM
-      const groupWPM = duration * 6000;
-      return Math.round(groupWPM);
-    }
+  const runningWPMValues = tidy(
+    annotatedSession,
+    groupBy("wpmGroup", [
+      summarize({ start: first("timestamp"), end: last("timestamp") }),
+    ]),
+    mutate({
+      // seconds between the first and last key press of the 5 character group (one word)
+      duration: ({ start, end }) => (end - start) / 1000,
+    }),
+    mutate({
+      // one word every `duration` seconds is 60 / duration WPM; a zero duration yields 0
+      wpm: ({ duration }) => (duration > 0 ? Math.round(60 / duration) : 0),
+    })
   );
-  return runningWPMValues;
+
+  return runningWPMValues.map(({ wpm }) => wpm);
 };
 
 export default computeRunningWPM;
diff --git a/src/analysis/computeWPM.ts b/src/analysis/computeWPM.ts
@@ -1,14 +1,15 @@
-import { DataFrame } from "danfojs";
+import { tidy, filter } from "@tidyjs/tidy";
+import { LongSessionRow } from "../types";
 
-const computeWPM = (session: DataFrame): number => {
-  const first = session.head(1)["timestamp"];
-  const last = session.tail(1)["timestamp"];
+const computeWPM = (session: LongSessionRow[]): number => {
+  const first = session[0]["timestamp"];
+  const last = session[session.length - 1]["timestamp"];
   // get the total duration of the test
-  const duration = last.sub(first).values[0];
+  const duration = last - first;
   // divide to get minutes
   const durationInMin = duration / 60000;
   // get the number of correct characters typed
-  const chars = session.query(session["correct"]).shape[0];
+  const chars = tidy(session, filter(({ correct }) => correct)).length;
   // assume an average word is 5 characters
   const words = chars / 5;
   const wpm = words / durationInMin;

diff --git a/src/analysis/sessionPostProcessing.ts b/src/analysis/sessionPostProcessing.ts
@@ -1,4 +1,3 @@
-import * as dfd from "danfojs";
 import { v4 as uuidv4 } from "uuid";
 import { LongSessionRow, Word } from "../types";
 import computeAccuracy from "./computeAccuracy";
@@ -25,14 +24,14 @@ export const dumpSession = async (session: Word[]): Promise<void> => {
 };
 
 // create a dataframe where each typing event takes up one row
-export const pivotSessionLong = (session: Word[]): dfd.DataFrame => {
+export const pivotSessionLong = (session: Word[]): LongSessionRow[] => {
   const rows = session.flatMap(({ target, history }) =>
     history.map((e) => ({
       target,
       ...e,
     }))
   ) as LongSessionRow[];
 
-  const out = new dfd.DataFrame(rows);
+  const out = rows;
   return out;
 };
diff --git a/src/bin/createFinalOutput.ts b/src/bin/createFinalOutput.ts
@@ -0,0 +1,16 @@
+interface FinalOutputItem {
+  key: string;
+  timestamp: number;
+}
+// replays the key events to build the final output: a "Backspace" drops the most recently kept key, any other key is appended with its timestamp
+export const createFinalOutput = (
+  keys: string[],
+  timestamp: number[]
+): FinalOutputItem[] => {
+  let out: FinalOutputItem[] = [];
+  keys.forEach((k, i) => {
+    if (k !== "Backspace") out.push({ key: k, timestamp: timestamp[i] });
+    else out = out.slice(0, out.length - 1);
+  });
+  return out;
+};
diff --git a/src/bin/pullColumns.ts b/src/bin/pullColumns.ts
@@ -0,0 +1,12 @@
+// returns one array per entry of `columns` (in that order), each holding that column's value from every item
+// assumes every item has all requested keys; a key missing from an item contributes `undefined`
+export const pullColumns = (items: Object[], columns: string[]): any[][] => {
+  const colData: { [key: string]: any[] } = {};
+  columns.forEach((c) => (colData[c] = []));
+  items.forEach((i: { [key: string]: any }) =>
+    columns.forEach((c) => {
+      colData[c].push(i[c]);
+    })
+  );
+  return columns.map((c) => colData[c]);
+};
diff --git a/src/components/Accuracy.tsx b/src/components/Accuracy.tsx
@@ -1,7 +1,7 @@
-import { DataFrame } from "danfojs";
 import computeAccuracy from "../analysis/computeAccuracy";
+import { LongSessionRow } from "../types";
 
-const Accuracy = ({ session }: { session: DataFrame }) => {
+const Accuracy = ({ session }: { session: LongSessionRow[] }) => {
   return (
     <div className="analysis-stat">
       Accuracy: {(100 * computeAccuracy(session)).toFixed(0)}%

diff --git a/src/components/NGram.tsx b/src/components/NGram.tsx
@@ -1,7 +1,6 @@
-import { DataFrame } from "danfojs";
-import { Series } from "danfojs/dist/danfojs-base";
+import { tidy, mean, summarize } from "@tidyjs/tidy";
 import computeNGram from "../analysis/computeNGram";
-import { nGramFeedback } from "../types";
+import { LongSessionRow, nGramFeedback } from "../types";
 
 interface ProcessedNGramData {
   nGram: string;
@@ -12,11 +11,14 @@ interface ProcessedNGramData {
 
 const processNGramData = (nGrams: nGramFeedback[]): ProcessedNGramData[] =>
   nGrams.map(({ nGram, performance }) => {
-    const meanDuration = Math.round(
-      new Series(performance.map(({ duration }) => duration)).mean()
-    );
-    const correctData = performance.flatMap(
-      ({ correct }) => correct.indexOf(false) <= -1
+    const meanDuration = tidy(
+      performance,
+      summarize({ meanDuration: mean("duration") })
+    )[0]["meanDuration"] as number;
+
+    // one flag per occurrence: true when every key press in it was correct
+    const correctData = performance.map(
+      ({ correct }) => correct.indexOf(false) <= -1
     );
     const meanAccuracy =
       correctData.filter((e) => e).length / correctData.length;
@@ -42,12 +44,12 @@ const displayNGramData = (
       <tr>
         <th scope="row">{nGram}</th>
         <td>{nGramCount}</td>
-        <td>{meanDuration}ms</td>
+        <td>{Math.round(meanDuration)}ms</td>
         <td>{Math.round(meanAccuracy * 100)}%</td>
       </tr>
     ));
 
-const NGram = ({ session }: { session: DataFrame }) => {
+const NGram = ({ session }: { session: LongSessionRow[] }) => {
   const sortBy: NGramSortables = "nGramCount";
   const nRows = 10;
 

diff --git a/src/components/RunningWPM.tsx b/src/components/RunningWPM.tsx
@@ -1,9 +1,13 @@
 import { ReactElement } from "react";
 import ECharts from "echarts-for-react";
-import { DataFrame } from "danfojs";
 import computeRunningWPM from "../analysis/computeRunningWPM";
+import { LongSessionRow } from "../types";
 
-const RunningWPM = ({ session }: { session: DataFrame }): ReactElement => {
+const RunningWPM = ({
+  session,
+}: {
+  session: LongSessionRow[];
+}): ReactElement => {
   // blah
   const runningWPM = computeRunningWPM(session);
   const options = {

diff --git a/src/components/WPM.tsx b/src/components/WPM.tsx
@@ -1,7 +1,7 @@
-import { DataFrame } from "danfojs";
 import computeWPM from "../analysis/computeWPM";
+import { LongSessionRow } from "../types";
 
-const WPM = ({ session }: { session: DataFrame }) => {
+const WPM = ({ session }: { session: LongSessionRow[] }) => {
   const wpm = computeWPM(session);
   return <div className="analysis-stat">WPM: {wpm}</div>;
 };

diff --git a/src/routes/Analysis.tsx b/src/routes/Analysis.tsx
@@ -1,4 +1,3 @@
-import { DataFrame } from "danfojs";
 import { useEffect, useState } from "react";
 import { Link } from "react-router-dom";
 import { RxDocument } from "rxdb";
@@ -9,7 +8,7 @@ import NGram from "../components/NGram";
 import RunningWPM from "../components/RunningWPM";
 import WPM from "../components/WPM";
 import "../styles/Analysis.scss";
-import { Word } from "../types";
+import { LongSessionRow, Word } from "../types";
 import { SessionType } from "../types/sessionSchema";
 
 const Vis = () => {
@@ -18,7 +17,7 @@ const Vis = () => {
     Function
   ] = useState([]);
   const [selectedSession, setSelectedSession]: [
-    DataFrame | undefined,
+    LongSessionRow[] | undefined,
     Function
   ] = useState();
   const [selectedSessionUUID, setSelectedSessionUUID] = useState("");

diff --git a/src/types/index.ts b/src/types/index.ts
@@ -30,7 +30,7 @@ export interface WordPerformance {
     // array of whether the right key was pressed
     correct: boolean[];
     // array of timestamps so we can know how long the word took to type
-    timestamps: number[];
+    timestamp: number[];
   }[];
 }