diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..32a4ddf --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.html linguist-detectable=false +*.Rmd linguist-language=R \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f4f606b --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata +*.Rproj diff --git a/Class Activity 1 Response.Rmd b/Class Activity 1 Response.Rmd new file mode 100644 index 0000000..5176c17 --- /dev/null +++ b/Class Activity 1 Response.Rmd @@ -0,0 +1,100 @@ +--- +title: "Class Activity 1" +Author: Timothy Lee +output: html_document +--- + + +**Load the libraries tidyr and dplyr** + +```{r echo = FALSE, message = FALSE} +library(tidyr) +library(dplyr) +``` + +**Create a data frame from the swirl-data.csv file called DF1** + +```{r echo = FALSE} +DF1 = read.csv("C:/Users/Timothy/Google Drive/TC Stuff/Analytics/HUDK 4050 & 4051 - Learning Analytics/projects/class-activity-1/swirl-data.csv", header = TRUE) +``` + +**The variables are:** + +* **`course_name` - the name of the R course the student attempted** +* **`lesson_name` - the lesson name** +* **`question_number` - the question number attempted correct - whether the question was answered correctly** +* **`attempt` - how many times the student attempted the question** +* **`skipped` - whether the student skipped the question** +* **`datetime` - the date and time the student attempted the question** +* **`hash` - anonymyzed student ID** + +**Create a new data frame that only includes the variables `hash`, `lesson_name` and `attempt` called DF2** + +```{r} +DF2 = DF1 %>% select("hash", "lesson_name", "attempt") +#DF2 = data.frame('Hash' = DF1$hash, 'LessonName' = DF1$lesson_name, 'Attempt' = DF1$attempt) +``` + +**Use the `group_by` function to create a data frame that sums all the attempts for each hash by each lesson_name called DF3** + +Is this correct? +```{r} +DF3 = DF2 %>% filter(!is.na(attempt)) %>% group_by(hash, lesson_name) %>% summarise(sumAttempts = sum(attempt)) +``` + +**On a scrap piece of paper draw what you think DF3 would look like if all the lesson names were column names** + +Something like: + +| Hash | Basic Building Blocks | Logic | +| ------------- |:-------------------------:| -----:| +| 2864 | 2 | 45 | +| 4807 | 3 | 100 | +| 2864 | 45 | 251 | + + +Convert DF3 to this format. + +```{r} +DF3spread = DF3 %>% spread(key = lesson_name, value = sumAttempts) +``` + +**Create a new data frame from DF1 called DF4 that only includes the variables `hash`, `lesson_name` and `correct`** + +```{r} +DF4 = DF1 %>% select("hash", "lesson_name", "correct") +``` + +**Convert the correct variable so that `TRUE` is coded as the number 1 and `FALSE` is coded as 0** +```{r} +codedCorrect = ifelse(DF4$correct == TRUE, yes = 1, no = 0) +#No need for additional ifelse for NA, since ifelse documentation says "Missing values in test give missing values in the result." +#I checked and this appears to work. +DF4 = cbind(DF4, codedCorrect) +``` + +**Create a new data frame called DF5 that provides a mean score for each student on each course** + +```{r} +DF5 = DF4 %>% filter(!is.na(codedCorrect)) %>% group_by(hash, lesson_name) %>% summarise(meanScore = mean(codedCorrect)) +``` + +**Extra credit Convert the datetime variable into month-day-year format and create a new data frame (DF6) that shows the average correct for each day** + +```{r} +#Select only the relevant variables +DF6 = DF1 %>% select("correct", "datetime") +#Convert score to numbers - 1 for correct, 0 for incorrect +DF6$correct = ifelse(DF6$correct == TRUE, yes = 1, no = 0) +#Change datetime to proper datetime variables +DF6$datetime = as.POSIXct(DF6$datetime, origin = "1970-01-01") +#Omit time so group_by groups by day and not second +DF6$datetime = lubridate::round_date(DF6$datetime, unit = 'day') + +DF6 = DF6 %>% filter(!is.na(correct)) %>% + group_by(datetime) %>% + summarise(averageCorrect = mean(correct)) +DF6 + +``` + diff --git a/Class-Activity-1-Response.html b/Class-Activity-1-Response.html new file mode 100644 index 0000000..ac09697 --- /dev/null +++ b/Class-Activity-1-Response.html @@ -0,0 +1,473 @@ + + + + +
+ + + + + + + + + +title: “Class Activity 1” Author: Timothy Lee output: html_document—
+Load the libraries tidyr and dplyr
+Create a data frame from the swirl-data.csv file called DF1
+The variables are:
+course_name - the name of the R course the student attemptedlesson_name - the lesson namequestion_number - the question number attempted correct - whether the question was answered correctlyattempt - how many times the student attempted the questionskipped - whether the student skipped the questiondatetime - the date and time the student attempted the questionhash - anonymyzed student IDCreate a new data frame that only includes the variables hash, lesson_name and attempt called DF2
DF2 = DF1 %>% select("hash", "lesson_name", "attempt")
+#DF2 = data.frame('Hash' = DF1$hash, 'LessonName' = DF1$lesson_name, 'Attempt' = DF1$attempt)
+Use the group_by function to create a data frame that sums all the attempts for each hash by each lesson_name called DF3
Is this correct?
+DF3 = DF2 %>% filter(!is.na(attempt)) %>% group_by(hash, lesson_name) %>% summarise(sumAttempts = sum(attempt))
+On a scrap piece of paper draw what you think DF3 would look like if all the lesson names were column names
+Something like:
+| Hash | +Basic Building Blocks | +Logic | +
|---|---|---|
| 2864 | +2 | +45 | +
| 4807 | +3 | +100 | +
| 2864 | +45 | +251 | +
Convert DF3 to this format.
+DF3spread = DF3 %>% spread(key = lesson_name, value = sumAttempts)
+Create a new data frame from DF1 called DF4 that only includes the variables hash, lesson_name and correct
DF4 = DF1 %>% select("hash", "lesson_name", "correct")
+Convert the correct variable so that TRUE is coded as the number 1 and FALSE is coded as 0
DF4coded = DF4 %>% mutate(codedCorrect = as.numeric(correct)) #Why do I need to subtract 2? It codes as 3 and 2, not 1 and 0
+Create a new data frame called DF5 that provides a mean score for each student on each course
+DF5 = DF4coded %>% filter(!is.na(codedCorrect)) %>% group_by(hash, lesson_name) %>% summarise(meanScore = mean(codedCorrect))
+Extra credit Convert the datetime variable into month-day-year format and create a new data frame (DF6) that shows the average correct for each day
+DF6 = DF1 %>% select("hash", "correct", "datetime") %>% mutate(datetime = as.POSIXct(DF1$datetime, origin = "1970-01-01")) %>%filter(!is.na(correct)) %>% group_by(datetime)
+
+
+
+
+