Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 93 additions & 8 deletions class-activity-6.Rmd
Original file line number Diff line number Diff line change
@@ -1,22 +1,73 @@
---
title: 'HUDK4050: Class Activity 6'
author: "Charles Lang"
date: "10/23/2018"
output: html_document
author: "Ningyao Xu"
date: "10/16/2018"
output: pdf_document
---
# Data Management
```{r}
# Packages used by the wrangling steps below.
library(tidyr)
library(dplyr)

# Load the raw survey responses.
DF1 <- read.csv("HUDK405019-clustering.csv", header = TRUE)

# Columns 1-2 are the first/last name and columns 15-16 are the free-text
# latitude/longitude answers; make sure they are character, not factor.
text_cols <- c(1, 2, 15, 16)
DF1[text_cols] <- lapply(DF1[text_cols], as.character)

# Use "First Last" as the row names so that the student names survive once
# the name columns are dropped.
DF1$name <- paste(DF1$First.Name, DF1$Last.Name)
rownames(DF1) <- DF1$name
DF1 <- DF1[, 3:16]

# From here on column 13 is latitude and column 14 is longitude.
lat_col <- 13
lon_col <- 14

# Drop students who typed the same value for latitude and longitude.
# A logical mask is used instead of negative indexing: DF1[-NULL, ] would
# silently remove EVERY row when nobody made this mistake, and comparing with
# `if` would stop with an error on a missing answer.
same_coord <- !is.na(DF1[[lat_col]]) & !is.na(DF1[[lon_col]]) &
  DF1[[lat_col]] == DF1[[lon_col]]
DF1 <- DF1[!same_coord, ]

# Swap the two answers for students who entered them in the wrong order
# (an "E" in the latitude column means it actually holds an east longitude).
swapped <- grepl("E", DF1[[lat_col]])
if (any(swapped)) {
  lat_tmp <- DF1[swapped, lat_col]
  DF1[swapped, lat_col] <- DF1[swapped, lon_col]
  DF1[swapped, lon_col] <- lat_tmp
}

# Keep only the leading number of each coordinate and discard the degree sign
# and everything after it. Matching the number itself (optional minus sign,
# digits, optional decimal part) avoids two problems of cutting at the first
# non-digit: the decimals were thrown away ("40.71" became "40") and negative
# values became empty strings. It also does not depend on how the degree sign
# is encoded on a particular machine. Note that a hemisphere letter (S/W) is
# still ignored, exactly as before.
leading_number <- function(x) {
  sub("^\\s*(-?[0-9]*\\.?[0-9]*).*$", "\\1", x, useBytes = TRUE)
}
DF1[[lat_col]] <- leading_number(DF1[[lat_col]])
DF1[[lon_col]] <- leading_number(DF1[[lon_col]])

# Remove letters and spaces from the answer columns and convert them to
# numeric. Anything that still is not a number becomes NA (with a warning).
numeric_cols <- c(1:11, lat_col, lon_col)
DF1[numeric_cols] <- lapply(DF1[numeric_cols], function(x) {
  as.numeric(gsub("[[:alpha:] ]", "", x))
})

# Drop every student with at least one missing value.
DF1 <- na.omit(DF1)

# Keep only the numeric variables; the row names are still the student names.
DF2 <- data.frame(select_if(DF1, is.numeric))

# Scale the data so that no variable has undue influence on the clustering.
DF2 <- scale(DF2)

```

# Find latitudes & longitudes for cities
Expand Down Expand Up @@ -46,32 +97,66 @@ Also, we need to choose the number of clusters we think are in the data. We will

```{r}

# k-means starts from randomly chosen centres, so fix the seed to get the same
# clusters every time the document is knitted.
set.seed(123)

# Cluster the scaled data into 4 groups.
fit <- kmeans(DF2, 4)

# We have created an object called "fit" that contains all the details of our
# clustering, including which observations belong to each cluster.

# We can access the list of clusters by typing "fit$cluster"; the top row
# corresponds to the original order the rows were in. Notice we have deleted
# some rows.

fit$cluster

# We can also attach these clusters to the scaled data by using the
# "data.frame" command to create a new data frame called DF3.

DF3 <- data.frame(DF2, fit$cluster)

# Have a look at the DF3 data frame. Let's change the names of the variables
# to make them more convenient with the names() command. The variables are
# numbered "1", "2", ... and the last column is called "cluster"; building the
# vector from ncol(DF2) keeps it the right length however many variables DF2
# has. c() stands for concatenate and creates a vector of anything, in this
# case a vector of names.

names(DF3) <- c(seq_len(ncol(DF2)), "cluster")

```

# Visualize your clusters in ggplot
```{r}
# Create a scatterplot of each student's location, with colour and point shape
# showing the cluster the student was assigned to.
library(ggplot2)

# Columns 13 and 14 of DF1 are latitude and longitude. The column names
# (including the "longtitude" spelling) are kept as they were in case DF4 is
# used elsewhere; only the axis labels are corrected.
DF4 <- data.frame(
  latitude = DF1[, 13],
  longtitude = DF1[, 14],
  cluster = fit$cluster
)

# ggplot() looks the variables up in DF4 itself, so attach() is not needed
# (and would leave DF4's columns on the search path).
ggplot(DF4, aes(x = longtitude, y = latitude, shape = factor(cluster))) +
  geom_point(aes(color = factor(cluster))) +
  labs(x = "longitude", y = "latitude")

```

# Can you group students from the classes data set in Assignment 2 using K-modes?

```{r}

# Load the class-enrolment survey and build a "First Last" name per student.
DT1 <- read.csv("hudk4050-classes.csv", header = TRUE)
DT1$Name <- paste(DT1$First.Name, DT1$Last.Name)

# Columns 3-9 are the six class columns plus the Name column created above.
DT2_dirty <- DT1[, 3:9]

# Long format: one row per (student, class).
DT3 <- DT2_dirty %>%
  gather(
    classnum, classcode,
    `Class.1`, `Class.2`, `Class.3`, `Class.4`, `Class.5`, `Class.6`
  ) %>%
  select(-classnum)

# Normalise the class codes by removing spaces ("HUDK 4050" -> "HUDK4050").
DT3$classcode <- gsub(" ", "", DT3$classcode)

# Drop empty answers, the class everybody takes (it cannot separate students)
# and one excluded student. distinct() removes a class listed twice by the
# same student, which would otherwise make spread() fail.
DT3 <- DT3 %>%
  filter(classcode != "", classcode != "HUDK4050", Name != "ZIMO CHEN") %>%
  distinct()
DT3$Count <- 1

# Wide format: one row per student, one column per class.
DT4 <- DT3 %>% spread(classcode, Count)
row.names(DT4) <- DT4$Name
DT4$Name <- NULL

# Turn the table into a 0/1 matrix (1 = student takes the class) and scale it.
DT4 <- ifelse(is.na(DT4), 0, 1)
DT5 <- as.matrix(DT4)
DT5 <- scale(DT5)

# Scree plot of a PCA on (at most) the first 50 classes to get an idea of how
# many groups to look for. princomp() comes from the stats package, so MASS
# is not loaded: it is not needed and would mask dplyr::select().
pca_cols <- seq_len(min(50, ncol(DT5)))
pca <- princomp(DT5[, pca_cols], cor = TRUE)
screeplot(pca, type = "lines", lwd = 2)

# According to this plot, maybe we should try 6 clusters.
set.seed(123)
fit2 <- kmeans(DT5, 6)
fit2$size
cluster <- data.frame(cluster = fit2$cluster)
cluster

# Just to check: I am in the Applied Statistics program, and the people in
# group 5 are exactly those in the Applied Statistics program with me!

```

Loading