diff --git a/class-activity-6.Rmd b/class-activity-6.Rmd index d96c635..981cdf5 100644 --- a/class-activity-6.Rmd +++ b/class-activity-6.Rmd @@ -7,12 +7,47 @@ output: html_document # Data Management ```{r} #Load data -DF1 <- read.csv("HUDK405019-clustering.csv", header = TRUE) +DF1 <- read.csv("HUDK405019-clustering.csv", header = TRUE,stringsAsFactors = FALSE) + +#Convert the index numbers of the data frame into the student names. +install.packages("tidyverse") +library(dplyr) +library(tidyr) +DF1<-unite(DF1,name,"First.Name","Last.Name",sep=" ") +row.names(DF1)<-DF1$name +colnames(DF1)<-c(1:15) -#Convert the index numbers of the data fram into the student names. - #Wrangle data using dplyr to include only the numerical values. +DF2<-select(DF1,c(2:12)) #rename + +DF2<-data.frame(lapply(DF2, gsub, pattern="\\D", replacement="")) #convert non digit to nothing +DF2<-lapply(DF2, as.character) #need to convert to character first before numeric + +#levels(DF3)[levels(DF3)==""] <- NA +#DF3[which(DF3=="")]<-0 + +#DF2<-lapply(DF2, as.numeric) + +for (i in 1:11){ + a<-length(DF2[[i]]) + for (j in 1:a){ + if (DF2[[i]][j]==""){ + DF2[[i]][j]<-0 + b<-mean(as.numeric(DF2[[i]])) + DF2[[i]][j]<-b + } + } +} + +#DF2%>%mutate_all(as.character)%>%mutate_all(as.numeric) +DF2<-data.frame(DF2) +indx <- sapply(DF2, is.factor) +DF2[indx] <- lapply(DF2[indx], function(x) as.numeric(as.character(x))) + +colnames(DF2)<-c("month live in NYC","# siblings","# sport each week","distance to TC","#friend on phone","# movies","#pet own","#people met this year","#cook","#class taking","#states traveled") + +#numberic??????? easy way to convert whole graph to numeric #Scale the data so that no variable has undue influence DF2 <- scale(DF2) @@ -46,32 +81,91 @@ Also, we need to choose the number of clusters we think are in the data. We will ```{r} -fit <- kmeans(DF2, 1) +fit <- kmeans(DF2, 4) #We have created an object called "fit" that contains all the details of our clustering including which observations belong to each cluster. #We can access the list of clusters by typing "fit$cluster", the top row corresponds to the original order the rows were in. Notice we have deleted some rows. -fit$cluster yay +K4<-fit$cluster -#We can also attach these clusters to te original dataframe by using the "data.frame" command to create a new data frame called K4. +#We can also attach these clusters to tje original dataframe by using the "data.frame" command to create a new data frame called K4. -DF3 <- data.frame(DF2, fit$cluster) +DF3 <- data.frame(DF2,K4) #Have a look at the DF3 dataframe. Lets change the names of the variables to make it more convenient with the names() command. -names(DF3) <- c("1", "2", "3", "4", "5", "cluster") #c() stands for concatonate and it creates a vector of anything, in this case a vector of names. +names(DF3) <- c("1", "2", "3", "4", "5","6","7","8","9","10","11","cluster") +#c() stands for concatonate and it creates a vector of anything, in this case a vector of names. ``` # Visualize your clusters in ggplot ```{r} #Create a scatterplot that plots location of each student and colors the points according to their cluster +install.packages("ggplot2") +library(ggplot2) +DF4<-DF1[14:15] +#DF4<-data.frame(lapply(DF4,gsub,pattern="[[:alpha:]]",replacement="")) +DF5<-data.frame(lapply(DF4,gsub,pattern="[[:alpha:]]",replacement="")) + + +#install.packages("stringr") +#library(stringr) +#DF6[,1]<-data.frame(lapply(DF5[,1],gsub,pattern=",.*",replacement="")) gsub convert everything to list, could not be placed in the dataframe. So if we want to change the specific cell, could devide the Landtitute and Longtitute to dataframe and then combine +#C<-lapply(DF4[1,1],gsub,pattern=",.*",replacement="") +#AA<-C[[1]][1] +#DF5[[1]][1]<-AA +#DF6<-data.frame(DF5) +#D<-lapply(DF4[1,2],gsub,pattern=".*,",replacement="") + +DF6<-data.frame(lapply(DF4,gsub,pattern="°.*",replacement="")) +DF6<-DF6[-10,] +DF7<-data.frame(lapply(DF6,gsub,pattern="[[:alpha:]]",replacement="")) +K4<-c(K4) +K4new<-K4[-10] +DF8<-data.frame(DF7,K4new) +colnames(DF8)<-c("latitute","longtitute","cluster") +#indx <- sapply(DF8, is.factor) +#DF8[indx] <- lapply(DF8[indx], function(x) as.numeric(as.character(x))) This fuction work well +DF8[,1:2]<-lapply(DF8[,1:2],function(x)as.numeric(as.character(x))) +#DF8<-as.numeric(DF8) +#plot<-ggplot(DF8)+geom_point(aes(x=latitute,y=longtitute,color=DF8$cluster)) +pp<-ggplot(DF8,aes(x=latitute,y=longtitute,pch=factor(cluster)))+geom_point(aes(color=factor(cluster))) +#+scale_y_continuous(breaks=seq(0, 150, 10)) +scale_x_continuous(breaks=seq(0, 150, 10)) +#ppp<-pp + scale_x_discrete(breaks=seq(0, 50, 5))+scale_y_discrete(breaks=seq(-100, 120, 20)) + + ``` # Can you group students from the classes data set in Assignment 2 using K-modes? ```{r} - +hudk4050.classes <- read.csv("~/Desktop/hudk4050/assignment2/hudk4050-classes.txt", header=FALSE, stringsAsFactors=FALSE) +installed.packages("tidyverse") +library(dplyr) +library(tidyr) +installed.packages("igraph") +library(igraph) +data<-select(hudk4050.classes,V20,V21,V22,V23,V24,V25,V26) +data <- data.frame(lapply(data, function(x) { + gsub(" ", "", x) + })) +data<-data[-c(1,2,3),] +Name<-data[,1] +#data[,1]<-c(1:53) +colnames(data)<-c("Name","Class 1","Class 2","Class 3","Class 4","Class 5","Class 6") +row.names(data)<-Name +data<-data.frame(lapply(data,gsub,pattern="@.*",replacement="")) +install.packages("klaR") +library(klaR) +kmode<-kmodes(data,4,iter.max=10,weighted=FALSE) +#datatran<-gather(data,"Class Number","ClassName",2:7) +#datatran<-datatran[,-2] #remove col class number +#datatran<-filter(datatran,ClassName> 0,ClassName!="HUDK4050") +#datatran$count<-1 +#dataaa<-spread(datatran,ClassName,count,fill=0) +#row.names(dataaa)<-Name +#dataaa<-dataaa[,-1] ```