diff --git a/PROJ3_WIN/565Rasterizer.sdf b/PROJ3_WIN/565Rasterizer.sdf
new file mode 100644
index 0000000..b693313
Binary files /dev/null and b/PROJ3_WIN/565Rasterizer.sdf differ
diff --git a/PROJ3_WIN/565Rasterizer/565Rasterizer.vcxproj b/PROJ3_WIN/565Rasterizer/565Rasterizer.vcxproj
index 1077f39..a072658 100755
--- a/PROJ3_WIN/565Rasterizer/565Rasterizer.vcxproj
+++ b/PROJ3_WIN/565Rasterizer/565Rasterizer.vcxproj
@@ -86,6 +86,7 @@
     <CudaCompile>
       <CompileOut>$(ProjectDir)$(Platform)/$(Configuration)/%(Filename)%(Extension).obj</CompileOut>
       <Include>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.0\include;C:/ProgramData/NVIDIA Corporation/NVIDIA GPU Computing SDK 4.0/C/common/inc;../shared/glew/includes;../shared/freeglut/includes</Include>
+      <CodeGeneration>compute_20,sm_20</CodeGeneration>
     </CudaCompile>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -111,6 +112,7 @@
     <CudaCompile>
       <CompileOut>$(ProjectDir)$(Platform)/$(Configuration)/%(Filename)%(Extension).obj</CompileOut>
       <Include>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.0\include;C:/ProgramData/NVIDIA Corporation/NVIDIA GPU Computing SDK 4.0/C/common/inc;../shared/glew/includes;../shared/freeglut/includes</Include>
+      <CodeGeneration>compute_20,sm_20</CodeGeneration>
     </CudaCompile>
   </ItemDefinitionGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
diff --git a/PROJ3_WIN/565Rasterizer/565Rasterizer.vcxproj.gundeep.nvuser b/PROJ3_WIN/565Rasterizer/565Rasterizer.vcxproj.gundeep.nvuser
new file mode 100644
index 0000000..2fa0bb0
--- /dev/null
+++ b/PROJ3_WIN/565Rasterizer/565Rasterizer.vcxproj.gundeep.nvuser
@@ -0,0 +1,5 @@
+﻿<ProjectSettingsModel DefinitionId="a218e900-1199-4ab1-a767-7976786f04d4" DisplayName="Nexus Project User Settings" xmlns="clr-namespace:Ark.PropertyModel;assembly=Ark">
+    <SettingsPointModel DefinitionId="3eb7ba04-016d-475e-b0ff-daa5f0e59a08" DisplayName="Launch">
+        <Property Name="Arguments" Value="mesh=&quot;../../objs/cow.obj&quot;" />
+    </SettingsPointModel>
+</ProjectSettingsModel>
\ No newline at end of file
diff --git a/PROJ3_WIN/Readme Rasterizer.pdf b/PROJ3_WIN/Readme Rasterizer.pdf
new file mode 100644
index 0000000..b53db11
Binary files /dev/null and b/PROJ3_WIN/Readme Rasterizer.pdf differ
diff --git a/objs/cube.obj b/objs/cube.obj
new file mode 100644
index 0000000..f696427
--- /dev/null
+++ b/objs/cube.obj
@@ -0,0 +1,34 @@
+# cube.obj
+#
+ 
+g cube
+ 
+v 0.0 0.0 0.0
+v 0.0 0.0 1.0
+v 0.0 1.0 0.0
+v 0.0 1.0 1.0
+v 1.0 0.0 0.0
+v 1.0 0.0 1.0
+v 1.0 1.0 0.0
+v 1.0 1.0 1.0
+
+vn 0.0 0.0 1.0
+vn 0.0 0.0 -1.0
+vn 0.0 1.0 0.0
+vn 0.0 -1.0 0.0
+vn 1.0 0.0 0.0
+vn -1.0 0.0 0.0
+ 
+f 1//2 7//2 5//2
+f 1//2 3//2 7//2 
+f 1//6 4//6 3//6 
+f 1//6 2//6 4//6 
+f 3//3 8//3 7//3 
+f 3//3 4//3 8//3 
+f 5//5 7//5 8//5 
+f 5//5 8//5 6//5 
+f 1//4 5//4 6//4 
+f 1//4 6//4 2//4 
+f 2//1 6//1 8//1 
+f 2//1 8//1 4//1 
+ 
diff --git a/objs/tri.obj b/objs/tri.obj
new file mode 100644
index 0000000..5085e34
--- /dev/null
+++ b/objs/tri.obj
@@ -0,0 +1,23 @@
+# tri.obj
+#
+ 
+g cube
+ 
+v 0.0 0.0 0.0
+v 0.0 0.0 1.0
+v 0.0 1.0 0.0
+v 0.0 1.0 1.0
+v 1.0 0.0 0.0
+v 1.0 0.0 1.0
+v 1.0 1.0 0.0
+v 1.0 1.0 1.0
+
+vn 0.0 0.0 1.0
+vn 0.0 0.0 -1.0
+vn 0.0 1.0 0.0
+vn 0.0 -1.0 0.0
+vn 1.0 0.0 0.0
+vn -1.0 0.0 0.0
+ 
+f 2//1 8//1 4//1 
+ 
diff --git a/src/ObjCore/obj.cpp b/src/ObjCore/obj.cpp
index e748574..a7f6e6a 100755
--- a/src/ObjCore/obj.cpp
+++ b/src/ObjCore/obj.cpp
@@ -9,7 +9,7 @@
 
 using namespace std;
 
-obj::obj(){
+obj::obj(){					// constructor
 	vbosize = 0;
 	nbosize = 0;
 	cbosize = 0;
@@ -22,7 +22,7 @@ obj::obj(){
 	
 }
 
-obj::~obj(){
+obj::~obj(){				//destructor
 	/*delete vbo;
 	delete nbo;
 	delete cbo;
@@ -37,7 +37,7 @@ obj::~obj(){
 
 void obj::buildVBOs(){
 	recenter();
-	vector<float> VBOvec;
+	vector<float> VBOvec;   // vertex buffer object
 	vector<float> NBOvec;
 	vector<int> IBOvec;
 	int index = 0;
diff --git a/src/main.cpp b/src/main.cpp
index dfb689a..47e77bf 100755
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,354 +1,372 @@
-// CIS565 CUDA Rasterizer: A simple rasterization pipeline for Patrick Cozzi's CIS565: GPU Computing at the University of Pennsylvania
-// Written by Yining Karl Li, Copyright (c) 2012 University of Pennsylvania
-
-#include "main.h"
-
-//-------------------------------
-//-------------MAIN--------------
-//-------------------------------
-
-int main(int argc, char** argv){
-
-  bool loadedScene = false;
-  for(int i=1; i<argc; i++){
-    string header; string data;
-    istringstream liness(argv[i]);
-    getline(liness, header, '='); getline(liness, data, '=');
-    if(strcmp(header.c_str(), "mesh")==0){
-      //renderScene = new scene(data);
-      mesh = new obj();
-      objLoader* loader = new objLoader(data, mesh);
-      mesh->buildVBOs();
-      delete loader;
-      loadedScene = true;
-    }
-  }
-
-  if(!loadedScene){
-    cout << "Usage: mesh=[obj file]" << endl;
-    return 0;
-  }
-
-  frame = 0;
-  seconds = time (NULL);
-  fpstracker = 0;
-
-  // Launch CUDA/GL
-  #ifdef __APPLE__
-  // Needed in OSX to force use of OpenGL3.2 
-  glfwOpenWindowHint(GLFW_OPENGL_VERSION_MAJOR, 3);
-  glfwOpenWindowHint(GLFW_OPENGL_VERSION_MINOR, 2);
-  glfwOpenWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
-  glfwOpenWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
-  init();
-  #else
-  init(argc, argv);
-  #endif
-
-  initCuda();
-
-  initVAO();
-  initTextures();
-
-  GLuint passthroughProgram;
-  passthroughProgram = initShader("shaders/passthroughVS.glsl", "shaders/passthroughFS.glsl");
-
-  glUseProgram(passthroughProgram);
-  glActiveTexture(GL_TEXTURE0);
-
-  #ifdef __APPLE__
-    // send into GLFW main loop
-    while(1){
-      display();
-      if (glfwGetKey(GLFW_KEY_ESC) == GLFW_PRESS || !glfwGetWindowParam( GLFW_OPENED )){
-          kernelCleanup();
-          cudaDeviceReset(); 
-          exit(0);
-      }
-    }
-
-    glfwTerminate();
-  #else
-    glutDisplayFunc(display);
-    glutKeyboardFunc(keyboard);
-
-    glutMainLoop();
-  #endif
-  kernelCleanup();
-  return 0;
-}
-
-//-------------------------------
-//---------RUNTIME STUFF---------
-//-------------------------------
-
-void runCuda(){
-  // Map OpenGL buffer object for writing from CUDA on a single GPU
-  // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer
-  dptr=NULL;
-
-  vbo = mesh->getVBO();
-  vbosize = mesh->getVBOsize();
-
-  float newcbo[] = {0.0, 1.0, 0.0, 
-                    0.0, 0.0, 1.0, 
-                    1.0, 0.0, 0.0};
-  cbo = newcbo;
-  cbosize = 9;
-
-  ibo = mesh->getIBO();
-  ibosize = mesh->getIBOsize();
-
-  cudaGLMapBufferObject((void**)&dptr, pbo);
-  cudaRasterizeCore(dptr, glm::vec2(width, height), frame, vbo, vbosize, cbo, cbosize, ibo, ibosize);
-  cudaGLUnmapBufferObject(pbo);
-
-  vbo = NULL;
-  cbo = NULL;
-  ibo = NULL;
-
-  frame++;
-  fpstracker++;
-
-}
-
-#ifdef __APPLE__
-
-  void display(){
-      runCuda();
-      time_t seconds2 = time (NULL);
-
-      if(seconds2-seconds >= 1){
-
-        fps = fpstracker/(seconds2-seconds);
-        fpstracker = 0;
-        seconds = seconds2;
-
-      }
-
-      string title = "CIS565 Rasterizer | "+ utilityCore::convertIntToString((int)fps) + "FPS";
-
-      glfwSetWindowTitle(title.c_str());
-
-
-      glBindBuffer( GL_PIXEL_UNPACK_BUFFER, pbo);
-      glBindTexture(GL_TEXTURE_2D, displayImage);
-      glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, 
-            GL_RGBA, GL_UNSIGNED_BYTE, NULL);
-
-
-      glClear(GL_COLOR_BUFFER_BIT);   
-
-      // VAO, shader program, and texture already bound
-      glDrawElements(GL_TRIANGLES, 6,  GL_UNSIGNED_SHORT, 0);
-
-      glfwSwapBuffers();
-  }
-
-#else
-
-  void display(){
-    runCuda();
-	time_t seconds2 = time (NULL);
-
-    if(seconds2-seconds >= 1){
-
-      fps = fpstracker/(seconds2-seconds);
-      fpstracker = 0;
-      seconds = seconds2;
-
-    }
-
-    string title = "CIS565 Rasterizer | "+ utilityCore::convertIntToString((int)fps) + "FPS";
-    glutSetWindowTitle(title.c_str());
-
-    glBindBuffer( GL_PIXEL_UNPACK_BUFFER, pbo);
-    glBindTexture(GL_TEXTURE_2D, displayImage);
-    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, 
-        GL_RGBA, GL_UNSIGNED_BYTE, NULL);
-
-    glClear(GL_COLOR_BUFFER_BIT);   
-
-    // VAO, shader program, and texture already bound
-    glDrawElements(GL_TRIANGLES, 6,  GL_UNSIGNED_SHORT, 0);
-
-    glutPostRedisplay();
-    glutSwapBuffers();
-  }
-
-  void keyboard(unsigned char key, int x, int y)
-  {
-    switch (key) 
-    {
-       case(27):
-         shut_down(1);    
-         break;
-    }
-  }
-
-#endif
-  
-//-------------------------------
-//----------SETUP STUFF----------
-//-------------------------------
-
-#ifdef __APPLE__
-  void init(){
-
-    if (glfwInit() != GL_TRUE){
-      shut_down(1);      
-    }
-
-    // 16 bit color, no depth, alpha or stencil buffers, windowed
-    if (glfwOpenWindow(width, height, 5, 6, 5, 0, 0, 0, GLFW_WINDOW) != GL_TRUE){
-      shut_down(1);
-    }
-
-    // Set up vertex array object, texture stuff
-    initVAO();
-    initTextures();
-  }
-#else
-  void init(int argc, char* argv[]){
-    glutInit(&argc, argv);
-    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
-    glutInitWindowSize(width, height);
-    glutCreateWindow("CIS565 Rasterizer");
-
-    // Init GLEW
-    glewInit();
-    GLenum err = glewInit();
-    if (GLEW_OK != err)
-    {
-      /* Problem: glewInit failed, something is seriously wrong. */
-      std::cout << "glewInit failed, aborting." << std::endl;
-      exit (1);
-    }
-
-    initVAO();
-    initTextures();
-  }
-#endif
-
-void initPBO(GLuint* pbo){
-  if (pbo) {
-    // set up vertex data parameter
-    int num_texels = width*height;
-    int num_values = num_texels * 4;
-    int size_tex_data = sizeof(GLubyte) * num_values;
-    
-    // Generate a buffer ID called a PBO (Pixel Buffer Object)
-    glGenBuffers(1,pbo);
-    // Make this the current UNPACK buffer (OpenGL is state-based)
-    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);
-    // Allocate data for the buffer. 4-channel 8-bit image
-    glBufferData(GL_PIXEL_UNPACK_BUFFER, size_tex_data, NULL, GL_DYNAMIC_COPY);
-    cudaGLRegisterBufferObject( *pbo );
-  }
-}
-
-void initCuda(){
-  // Use device with highest Gflops/s
-  cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
-
-  initPBO(&pbo);
-
-  // Clean up on program exit
-  atexit(cleanupCuda);
-
-  runCuda();
-}
-
-void initTextures(){
-    glGenTextures(1,&displayImage);
-    glBindTexture(GL_TEXTURE_2D, displayImage);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-    glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_BGRA,
-        GL_UNSIGNED_BYTE, NULL);
-}
-
-void initVAO(void){
-    GLfloat vertices[] =
-    { 
-        -1.0f, -1.0f, 
-         1.0f, -1.0f, 
-         1.0f,  1.0f, 
-        -1.0f,  1.0f, 
-    };
-
-    GLfloat texcoords[] = 
-    { 
-        1.0f, 1.0f,
-        0.0f, 1.0f,
-        0.0f, 0.0f,
-        1.0f, 0.0f
-    };
-
-    GLushort indices[] = { 0, 1, 3, 3, 1, 2 };
-
-    GLuint vertexBufferObjID[3];
-    glGenBuffers(3, vertexBufferObjID);
-    
-    glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[0]);
-    glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
-    glVertexAttribPointer((GLuint)positionLocation, 2, GL_FLOAT, GL_FALSE, 0, 0); 
-    glEnableVertexAttribArray(positionLocation);
-
-    glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[1]);
-    glBufferData(GL_ARRAY_BUFFER, sizeof(texcoords), texcoords, GL_STATIC_DRAW);
-    glVertexAttribPointer((GLuint)texcoordsLocation, 2, GL_FLOAT, GL_FALSE, 0, 0);
-    glEnableVertexAttribArray(texcoordsLocation);
-
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vertexBufferObjID[2]);
-    glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW);
-}
-
-GLuint initShader(const char *vertexShaderPath, const char *fragmentShaderPath){
-    GLuint program = glslUtility::createProgram(vertexShaderPath, fragmentShaderPath, attributeLocations, 2);
-    GLint location;
-
-    glUseProgram(program);
-    
-    if ((location = glGetUniformLocation(program, "u_image")) != -1)
-    {
-        glUniform1i(location, 0);
-    }
-
-    return program;
-}
-
-//-------------------------------
-//---------CLEANUP STUFF---------
-//-------------------------------
-
-void cleanupCuda(){
-  if(pbo) deletePBO(&pbo);
-  if(displayImage) deleteTexture(&displayImage);
-}
-
-void deletePBO(GLuint* pbo){
-  if (pbo) {
-    // unregister this buffer object with CUDA
-    cudaGLUnregisterBufferObject(*pbo);
-    
-    glBindBuffer(GL_ARRAY_BUFFER, *pbo);
-    glDeleteBuffers(1, pbo);
-    
-    *pbo = (GLuint)NULL;
-  }
-}
-
-void deleteTexture(GLuint* tex){
-    glDeleteTextures(1, tex);
-    *tex = (GLuint)NULL;
-}
- 
-void shut_down(int return_code){
-  kernelCleanup();
-  cudaDeviceReset();
-  #ifdef __APPLE__
-  glfwTerminate();
-  #endif
-  exit(return_code);
-}
+// CIS565 CUDA Rasterizer: A simple rasterization pipeline for Patrick Cozzi's CIS565: GPU Computing at the University of Pennsylvania
+// Written by Yining Karl Li, Copyright (c) 2012 University of Pennsylvania
+
+#include "main.h"
+
+//-------------------------------
+//-------------MAIN--------------
+//-------------------------------
+
+int main(int argc, char** argv){
+  // Entry point: loads the mesh named by a "mesh=<path>" argument, sets up GL/CUDA, then runs the display loop.
+  bool loadedScene = false;
+  for(int i=1; i<argc; i++){
+    string header; string data;
+    istringstream liness(argv[i]);
+    getline(liness, header, '='); getline(liness, data, '=');  // split "key=value" on '='
+    if(strcmp(header.c_str(), "mesh")==0){
+      //renderScene = new scene(data);
+      mesh = new obj();  // NOTE(review): a repeated mesh= argument overwrites this pointer without freeing the old obj
+      objLoader* loader = new objLoader(data, mesh);
+      mesh->buildVBOs();
+
+      delete loader;
+      loadedScene = true;
+    }
+  }
+
+  if(!loadedScene){
+    cout << "Usage: mesh=[obj file]" << endl;
+    return 0;
+  }
+
+  frame = 0;
+  seconds = time (NULL);
+  fpstracker = 0;
+
+  // Launch CUDA/GL
+  #ifdef __APPLE__
+  // Needed in OSX to force use of OpenGL3.2 
+  glfwOpenWindowHint(GLFW_OPENGL_VERSION_MAJOR, 3);
+  glfwOpenWindowHint(GLFW_OPENGL_VERSION_MINOR, 2);
+  glfwOpenWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
+  glfwOpenWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
+  init();
+  #else
+  init(argc, argv);
+  #endif
+
+  initCuda();
+  // NOTE(review): init() above already calls initVAO() and initTextures(); the two calls below repeat that setup.
+  initVAO();
+  initTextures();
+
+  GLuint passthroughProgram;
+  passthroughProgram = initShader("shaders/passthroughVS.glsl", "shaders/passthroughFS.glsl");
+
+  glUseProgram(passthroughProgram);
+  glActiveTexture(GL_TEXTURE0);
+
+  #ifdef __APPLE__
+    // send into GLFW main loop
+    while(1){
+      display();
+      if (glfwGetKey(GLFW_KEY_ESC) == GLFW_PRESS || !glfwGetWindowParam( GLFW_OPENED )){
+          kernelCleanup();
+          cudaDeviceReset(); 
+          exit(0);
+      }
+    }
+
+    glfwTerminate();  // not reached: the loop above has no break and only leaves via exit(0)
+  #else
+    glutDisplayFunc(display);
+    glutKeyboardFunc(keyboard);
+
+    glutMainLoop();
+  #endif
+  kernelCleanup();
+  return 0;
+}
+
+//-------------------------------
+//---------RUNTIME STUFF---------
+//-------------------------------
+
+void runCuda(){
+  // Renders one frame: gathers the mesh's vertex/normal/index buffers plus a fixed colour
+  // table and light, then lets cudaRasterizeCore write into the mapped pixel buffer object.
+  //
+  // Map OpenGL buffer object for writing from CUDA on a single GPU
+  // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer
+  dptr=NULL;
+
+  // Geometry buffers are fetched from the loaded mesh on every call
+  vbo = mesh->getVBO();
+  vbosize = mesh->getVBOsize();
+
+  nbo = mesh->getNBO();
+  nbosize= mesh->getNBOsize();
+
+  // Nine floats handed over as the colour buffer. The array lives on this stack frame, so
+  // cbo must not be used after this function returns (it is reset to NULL below).
+  float newcbo[] = {0.0, 1.0, 0.0, 
+                    0.0, 0.0, 1.0, 
+                    1.0, 0.0, 0.0};
+  cbo = newcbo;
+  cbosize = 9;
+
+  ibo = mesh->getIBO();
+  ibosize = mesh->getIBOsize();
+
+  // Light colour and position are fixed constants for every frame
+  glm::vec3 lightcol=glm::vec3(1.0,1.0,1.0);
+  glm::vec3 lightpos=glm::vec3(0,10.0,0.0);
+
+  // dptr is only meaningful between the map and unmap calls.
+  // Argument order matters: nbosize is passed before nbo.
+  cudaGLMapBufferObject((void**)&dptr, pbo);
+  cudaRasterizeCore(dptr, glm::vec2(width, height),frame, vbo,  vbosize,  cbo,  cbosize,  ibo,  ibosize, nbosize,nbo,lightpos,lightcol);
+  cudaGLUnmapBufferObject(pbo);
+
+  // Clear the global aliases now that the frame has been submitted
+  vbo = NULL;
+  cbo = NULL;
+  ibo = NULL;
+  nbo = NULL;
+
+  // Advance the frame number and the per-second counter used for the FPS display
+  frame++;
+  fpstracker++;
+}
+
+#ifdef __APPLE__
+
+  void display(){  // GLFW per-frame routine: rasterize with CUDA, refresh the FPS title, draw the result as a textured quad.
+      runCuda();
+      time_t seconds2 = time (NULL);
+      // Recompute the FPS figure once at least one whole second has elapsed
+      if(seconds2-seconds >= 1){
+
+        fps = fpstracker/(seconds2-seconds);  // frames counted divided by elapsed seconds
+        fpstracker = 0;
+        seconds = seconds2;
+
+      }
+
+      string title = "CIS565 Rasterizer | "+ utilityCore::convertIntToString((int)fps) + "FPS";
+
+      glfwSetWindowTitle(title.c_str());
+
+      // With pbo bound as GL_PIXEL_UNPACK_BUFFER, the NULL data pointer below is an offset into that buffer
+      glBindBuffer( GL_PIXEL_UNPACK_BUFFER, pbo);
+      glBindTexture(GL_TEXTURE_2D, displayImage);
+      glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, 
+            GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+
+
+      glClear(GL_COLOR_BUFFER_BIT);   
+
+      // VAO, shader program, and texture already bound
+      glDrawElements(GL_TRIANGLES, 6,  GL_UNSIGNED_SHORT, 0);  // 6 indices = the two triangles set up in initVAO
+
+      glfwSwapBuffers();
+  }
+
+#else
+
+  void display(){  // GLUT display callback: rasterize with CUDA, refresh the FPS title, draw the result as a textured quad.
+    runCuda();
+	time_t seconds2 = time (NULL);
+    // Recompute the FPS figure once at least one whole second has elapsed
+    if(seconds2-seconds >= 1){
+
+      fps = fpstracker/(seconds2-seconds);  // frames counted divided by elapsed seconds
+      fpstracker = 0;
+      seconds = seconds2;
+
+    }
+
+    string title = "CIS565 Rasterizer | "+ utilityCore::convertIntToString((int)fps) + "FPS";
+    glutSetWindowTitle(title.c_str());
+    // With pbo bound as GL_PIXEL_UNPACK_BUFFER, the NULL data pointer below is an offset into that buffer
+    glBindBuffer( GL_PIXEL_UNPACK_BUFFER, pbo);
+    glBindTexture(GL_TEXTURE_2D, displayImage);
+    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, 
+        GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+
+    glClear(GL_COLOR_BUFFER_BIT);   
+
+    // VAO, shader program, and texture already bound
+    glDrawElements(GL_TRIANGLES, 6,  GL_UNSIGNED_SHORT, 0);  // 6 indices = the two triangles set up in initVAO
+
+    glutPostRedisplay();  // ask GLUT to invoke this callback again, so frames render continuously
+    glutSwapBuffers();
+  }
+
+  // GLUT keyboard callback: Escape quits the application. x and y (pointer position) are unused.
+  void keyboard(unsigned char key, int x, int y){
+    switch (key) 
+    {
+       case(27):  // ASCII 27 == Escape
+         shut_down(0);  // a user-requested quit is a normal exit, matching exit(0) in the GLFW main loop
+         break;
+    }
+  }
+
+#endif
+  
+//-------------------------------
+//----------SETUP STUFF----------
+//-------------------------------
+
+#ifdef __APPLE__
+  void init(){  // GLFW start-up: initialise the library, open the window, then build the quad buffers and texture.
+    // Any failure below terminates the process through shut_down(1)
+    if (glfwInit() != GL_TRUE){
+      shut_down(1);      
+    }
+
+    // 16 bit color, no depth, alpha or stencil buffers, windowed
+    if (glfwOpenWindow(width, height, 5, 6, 5, 0, 0, 0, GLFW_WINDOW) != GL_TRUE){
+      shut_down(1);
+    }
+
+    // Set up vertex array object, texture stuff
+    initVAO();
+    initTextures();
+  }
+#else
+  void init(int argc, char* argv[]){
+    // GLUT/GLEW start-up: opens a double-buffered RGBA window, initialises GLEW, builds the quad buffers and texture.
+    glutInit(&argc, argv);
+    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
+    glutInitWindowSize(width, height);
+    glutCreateWindow("CIS565 Rasterizer");
+
+    // Init GLEW exactly once, after the window (and its GL context) exists, and check the result
+    GLenum err = glewInit();
+    if (GLEW_OK != err)
+    {
+      /* Problem: glewInit failed, something is seriously wrong. */
+      std::cout << "glewInit failed, aborting." << std::endl;
+      exit (1);
+    }
+
+    initVAO();
+    initTextures();
+  }
+#endif
+
+void initPBO(GLuint* pbo){  // Creates the pixel buffer object and registers it with CUDA; does nothing if pbo is a null pointer.
+  if (pbo) {
+    // Byte size of a width x height image with four one-byte channels per texel
+    int num_texels = width*height;
+    int num_values = num_texels * 4;
+    int size_tex_data = sizeof(GLubyte) * num_values;
+    
+    // Generate a buffer ID called a PBO (Pixel Buffer Object)
+    glGenBuffers(1,pbo);
+    // Make this the current UNPACK buffer (OpenGL is state-based)
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, *pbo);
+    // Allocate data for the buffer. 4-channel 8-bit image
+    glBufferData(GL_PIXEL_UNPACK_BUFFER, size_tex_data, NULL, GL_DYNAMIC_COPY);
+    cudaGLRegisterBufferObject( *pbo );  // register with CUDA so runCuda can map it each frame
+  }
+}
+
+void initCuda(){  // Selects the CUDA/GL device, creates the shared PBO, registers cleanup, and renders a first frame.
+  // Use device with highest Gflops/s
+  cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
+
+  initPBO(&pbo);  // fills in the global pbo handle
+
+  // Clean up on program exit
+  atexit(cleanupCuda);
+
+  runCuda();  // first rasterization pass, before the display loop starts
+}
+
+void initTextures(){  // Creates displayImage: a width x height RGBA8 2D texture with nearest-neighbour filtering.
+    glGenTextures(1,&displayImage);  // overwrites any handle previously stored in displayImage
+    glBindTexture(GL_TEXTURE_2D, displayImage);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+    glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_BGRA,
+        GL_UNSIGNED_BYTE, NULL);  // NOTE(review): format is GL_BGRA here, but display() uploads with GL_RGBA - confirm intended
+}
+
+void initVAO(void){  // Uploads the quad used to show the rendered image: positions, texture coordinates, indices.
+    GLfloat vertices[] =
+    { 
+        -1.0f, -1.0f, 
+         1.0f, -1.0f, 
+         1.0f,  1.0f, 
+        -1.0f,  1.0f, 
+    };
+
+    GLfloat texcoords[] = 
+    { 
+        1.0f, 1.0f,
+        0.0f, 1.0f,
+        0.0f, 0.0f,
+        1.0f, 0.0f
+    };
+
+    GLushort indices[] = { 0, 1, 3, 3, 1, 2 };  // two triangles over the four corners above
+
+    GLuint vertexBufferObjID[3];  // [0] positions, [1] texture coordinates, [2] indices
+    glGenBuffers(3, vertexBufferObjID);
+    // Attribute positionLocation: two floats per vertex, tightly packed
+    glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[0]);
+    glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
+    glVertexAttribPointer((GLuint)positionLocation, 2, GL_FLOAT, GL_FALSE, 0, 0); 
+    glEnableVertexAttribArray(positionLocation);
+    // Attribute texcoordsLocation: two floats per vertex, tightly packed
+    glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[1]);
+    glBufferData(GL_ARRAY_BUFFER, sizeof(texcoords), texcoords, GL_STATIC_DRAW);
+    glVertexAttribPointer((GLuint)texcoordsLocation, 2, GL_FLOAT, GL_FALSE, 0, 0);
+    glEnableVertexAttribArray(texcoordsLocation);
+    // Index buffer stays bound for the glDrawElements call in display(); no vertex array object is generated here
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vertexBufferObjID[2]);
+    glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW);
+}
+
+GLuint initShader(const char *vertexShaderPath, const char *fragmentShaderPath){  // Builds the program from the two shader paths and returns its handle.
+    GLuint program = glslUtility::createProgram(vertexShaderPath, fragmentShaderPath, attributeLocations, 2);  // 2 = entries in attributeLocations
+    GLint location;
+
+    glUseProgram(program);  // the program must be current for the glUniform1i call below
+    
+    if ((location = glGetUniformLocation(program, "u_image")) != -1)  // -1 means no such active uniform
+    {
+        glUniform1i(location, 0);  // u_image is set to 0, i.e. texture unit 0
+    }
+
+    return program;
+}
+
+//-------------------------------
+//---------CLEANUP STUFF---------
+//-------------------------------
+
+void cleanupCuda(){  // atexit handler (registered in initCuda): releases the PBO and the display texture.
+  if(pbo) deletePBO(&pbo);  // a non-zero handle is treated as "still allocated"
+  if(displayImage) deleteTexture(&displayImage);
+}
+
+void deletePBO(GLuint* pbo){  // Unregisters *pbo from CUDA, deletes the GL buffer, and zeroes the handle.
+  if (pbo) {  // guards the pointer itself, not the buffer handle it points to
+    // unregister this buffer object with CUDA
+    cudaGLUnregisterBufferObject(*pbo);
+    
+    glBindBuffer(GL_ARRAY_BUFFER, *pbo);
+    glDeleteBuffers(1, pbo);
+    
+    *pbo = (GLuint)NULL;  // handle becomes 0, so cleanupCuda's "if(pbo)" guard skips a second delete
+  }
+}
+
+void deleteTexture(GLuint* tex){  // Deletes the GL texture named by *tex and zeroes the handle; tex must not be null.
+    glDeleteTextures(1, tex);
+    *tex = (GLuint)NULL;  // handle becomes 0, so cleanupCuda's "if(displayImage)" guard skips it afterwards
+}
+ 
+void shut_down(int return_code){  // Tears down CUDA (and GLFW on Apple builds), then exits with return_code; never returns.
+  kernelCleanup();
+  cudaDeviceReset();
+  #ifdef __APPLE__
+  glfwTerminate();
+  #endif
+  exit(return_code);  // exit() also runs the atexit handler cleanupCuda if initCuda registered it
+}
diff --git a/src/main.h b/src/main.h
index 63bf0fa..127d938 100755
--- a/src/main.h
+++ b/src/main.h
@@ -1,105 +1,107 @@
-// CIS565 CUDA Rasterizer: A simple rasterization pipeline for Patrick Cozzi's CIS565: GPU Computing at the University of Pennsylvania
-// Written by Yining Karl Li, Copyright (c) 2012 University of Pennsylvania
-
-#ifndef MAIN_H
-#define MAIN_H
-
-#ifdef __APPLE__
-	#include <GL/glfw.h>
-#else
-	#include <GL/glew.h>
-	#include <GL/glut.h>
-#endif
-
-#include <stdlib.h>
-#include <cuda_runtime.h>
-#include <cutil_inline.h>
-#include <cutil_gl_inline.h>
-#include <cuda_gl_interop.h>
-#include <string>
-#include <iostream>
-#include <sstream>
-#include <fstream>
-#include <time.h>
-#include "glslUtility.h"
-#include "glm/glm.hpp"
-#include "rasterizeKernels.h"
-#include "utilities.h"
-#include "ObjCore/objloader.h"
-
-using namespace std;
-
-//-------------------------------
-//------------GL STUFF-----------
-//-------------------------------
-int frame;
-int fpstracker;
-double seconds;
-int fps = 0;
-GLuint positionLocation = 0;
-GLuint texcoordsLocation = 1;
-const char *attributeLocations[] = { "Position", "Tex" };
-GLuint pbo = (GLuint)NULL;
-GLuint displayImage;
-uchar4 *dptr;
-
-obj* mesh;
-
-float* vbo;
-int vbosize;
-float* cbo;
-int cbosize;
-int* ibo;
-int ibosize;
-
-//-------------------------------
-//----------CUDA STUFF-----------
-//-------------------------------
-
-int width=800; int height=800;
-
-//-------------------------------
-//-------------MAIN--------------
-//-------------------------------
-
-int main(int argc, char** argv);
-
-//-------------------------------
-//---------RUNTIME STUFF---------
-//-------------------------------
-
-void runCuda();
-
-#ifdef __APPLE__
-	void display();
-#else
-	void display();
-	void keyboard(unsigned char key, int x, int y);
-#endif
-
-//-------------------------------
-//----------SETUP STUFF----------
-//-------------------------------
-
-#ifdef __APPLE__
-	void init();
-#else
-	void init(int argc, char* argv[]);
-#endif
-
-void initPBO(GLuint* pbo);
-void initCuda();
-void initTextures();
-void initVAO();
-GLuint initShader(const char *vertexShaderPath, const char *fragmentShaderPath);
-
-//-------------------------------
-//---------CLEANUP STUFF---------
-//-------------------------------
-
-void cleanupCuda();
-void deletePBO(GLuint* pbo);
-void deleteTexture(GLuint* tex);
-void shut_down(int return_code);
-
+// CIS565 CUDA Rasterizer: A simple rasterization pipeline for Patrick Cozzi's CIS565: GPU Computing at the University of Pennsylvania
+// Written by Yining Karl Li, Copyright (c) 2012 University of Pennsylvania
+
+#ifndef MAIN_H
+#define MAIN_H
+
+#ifdef __APPLE__
+	#include <GL/glfw.h>
+#else
+	#include <GL/glew.h>
+	#include <GL/glut.h>
+#endif
+
+#include <stdlib.h>
+#include <cuda_runtime.h>
+#include <cutil_inline.h>
+#include <cutil_gl_inline.h>
+#include <cuda_gl_interop.h>
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <time.h>
+#include "glslUtility.h"
+#include "glm/glm.hpp"
+#include "rasterizeKernels.h"
+#include "utilities.h"
+#include "ObjCore/objloader.h"
+
+using namespace std;
+
+//-------------------------------
+//------------GL STUFF-----------
+//-------------------------------
+int frame;                     // frame number, incremented by runCuda and passed to cudaRasterizeCore
+int fpstracker;                // frames rendered since the FPS figure was last recomputed
+double seconds;                // time() value captured when the FPS figure was last recomputed
+int fps = 0;                   // most recent frames-per-second figure shown in the window title
+GLuint positionLocation = 0;   // vertex attribute index used for quad positions
+GLuint texcoordsLocation = 1;  // vertex attribute index used for quad texture coordinates
+const char *attributeLocations[] = { "Position", "Tex" };  // attribute names handed to glslUtility::createProgram
+GLuint pbo = (GLuint)NULL;     // pixel buffer object shared between OpenGL and CUDA; 0 until initPBO runs
+GLuint displayImage;           // texture the PBO contents are uploaded into for display
+uchar4 *dptr;                  // pointer to the mapped PBO, set by cudaGLMapBufferObject in runCuda
+// NOTE(review): these are non-const definitions in a header; including main.h from a second .cpp would define them twice
+obj* mesh;                     // mesh loaded from the "mesh=" command-line argument
+// Per-frame aliases of the mesh buffers; runCuda sets them before rasterizing and resets the pointers to NULL after
+float* vbo;                    // from mesh->getVBO()
+float* nbo;                    // from mesh->getNBO()
+int nbosize;                   // from mesh->getNBOsize()
+int vbosize;                   // from mesh->getVBOsize()
+float* cbo;                    // points at runCuda's local colour array while a frame is rendered
+int cbosize;                   // set to 9 by runCuda
+int* ibo;                      // from mesh->getIBO()
+int ibosize;                   // from mesh->getIBOsize()
+
+//-------------------------------
+//----------CUDA STUFF-----------
+//-------------------------------
+
+int width=800; int height=800;
+
+//-------------------------------
+//-------------MAIN--------------
+//-------------------------------
+
+int main(int argc, char** argv);
+
+//-------------------------------
+//---------RUNTIME STUFF---------
+//-------------------------------
+
+void runCuda();
+
+#ifdef __APPLE__
+	void display();
+#else
+	void display();
+	void keyboard(unsigned char key, int x, int y);
+#endif
+
+//-------------------------------
+//----------SETUP STUFF----------
+//-------------------------------
+
+#ifdef __APPLE__
+	void init();
+#else
+	void init(int argc, char* argv[]);
+#endif
+
+void initPBO(GLuint* pbo);
+void initCuda();
+void initTextures();
+void initVAO();
+GLuint initShader(const char *vertexShaderPath, const char *fragmentShaderPath);
+
+//-------------------------------
+//---------CLEANUP STUFF---------
+//-------------------------------
+
+void cleanupCuda();
+void deletePBO(GLuint* pbo);
+void deleteTexture(GLuint* tex);
+void shut_down(int return_code);
+
 #endif
\ No newline at end of file
diff --git a/src/rasterizeKernels.cu b/src/rasterizeKernels.cu
index 826ec80..0b4c988 100755
--- a/src/rasterizeKernels.cu
+++ b/src/rasterizeKernels.cu
@@ -1,267 +1,627 @@
-// CIS565 CUDA Rasterizer: A simple rasterization pipeline for Patrick Cozzi's CIS565: GPU Computing at the University of Pennsylvania
-// Written by Yining Karl Li, Copyright (c) 2012 University of Pennsylvania
-
-#include <stdio.h>
-#include <cuda.h>
-#include <cmath>
-#include <cutil_math.h>
-#include <thrust/random.h>
-#include "rasterizeKernels.h"
-#include "rasterizeTools.h"
-
-glm::vec3* framebuffer;
-fragment* depthbuffer;
-float* device_vbo;
-float* device_cbo;
-int* device_ibo;
-triangle* primitives;
-
-void checkCUDAError(const char *msg) {
-  cudaError_t err = cudaGetLastError();
-  if( cudaSuccess != err) {
-    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) ); 
-    exit(EXIT_FAILURE); 
-  }
-} 
-
-//Handy dandy little hashing function that provides seeds for random number generation
-__host__ __device__ unsigned int hash(unsigned int a){
-    a = (a+0x7ed55d16) + (a<<12);
-    a = (a^0xc761c23c) ^ (a>>19);
-    a = (a+0x165667b1) + (a<<5);
-    a = (a+0xd3a2646c) ^ (a<<9);
-    a = (a+0xfd7046c5) + (a<<3);
-    a = (a^0xb55a4f09) ^ (a>>16);
-    return a;
-}
-
-//Writes a given fragment to a fragment buffer at a given location
-__host__ __device__ void writeToDepthbuffer(int x, int y, fragment frag, fragment* depthbuffer, glm::vec2 resolution){
-  if(x<resolution.x && y<resolution.y){
-    int index = (y*resolution.x) + x;
-    depthbuffer[index] = frag;
-  }
-}
-
-//Reads a fragment from a given location in a fragment buffer
-__host__ __device__ fragment getFromDepthbuffer(int x, int y, fragment* depthbuffer, glm::vec2 resolution){
-  if(x<resolution.x && y<resolution.y){
-    int index = (y*resolution.x) + x;
-    return depthbuffer[index];
-  }else{
-    fragment f;
-    return f;
-  }
-}
-
-//Writes a given pixel to a pixel buffer at a given location
-__host__ __device__ void writeToFramebuffer(int x, int y, glm::vec3 value, glm::vec3* framebuffer, glm::vec2 resolution){
-  if(x<resolution.x && y<resolution.y){
-    int index = (y*resolution.x) + x;
-    framebuffer[index] = value;
-  }
-}
-
-//Reads a pixel from a pixel buffer at a given location
-__host__ __device__ glm::vec3 getFromFramebuffer(int x, int y, glm::vec3* framebuffer, glm::vec2 resolution){
-  if(x<resolution.x && y<resolution.y){
-    int index = (y*resolution.x) + x;
-    return framebuffer[index];
-  }else{
-    return glm::vec3(0,0,0);
-  }
-}
-
-//Kernel that clears a given pixel buffer with a given color
-__global__ void clearImage(glm::vec2 resolution, glm::vec3* image, glm::vec3 color){
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * resolution.x);
-    if(x<=resolution.x && y<=resolution.y){
-      image[index] = color;
-    }
-}
-
-//Kernel that clears a given fragment buffer with a given fragment
-__global__ void clearDepthBuffer(glm::vec2 resolution, fragment* buffer, fragment frag){
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * resolution.x);
-    if(x<=resolution.x && y<=resolution.y){
-      fragment f = frag;
-      f.position.x = x;
-      f.position.y = y;
-      buffer[index] = f;
-    }
-}
-
-//Kernel that writes the image to the OpenGL PBO directly. 
-__global__ void sendImageToPBO(uchar4* PBOpos, glm::vec2 resolution, glm::vec3* image){
-  
-  int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-  int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-  int index = x + (y * resolution.x);
-  
-  if(x<=resolution.x && y<=resolution.y){
-
-      glm::vec3 color;      
-      color.x = image[index].x*255.0;
-      color.y = image[index].y*255.0;
-      color.z = image[index].z*255.0;
-
-      if(color.x>255){
-        color.x = 255;
-      }
-
-      if(color.y>255){
-        color.y = 255;
-      }
-
-      if(color.z>255){
-        color.z = 255;
-      }
-      
-      // Each thread writes one pixel location in the texture (textel)
-      PBOpos[index].w = 0;
-      PBOpos[index].x = color.x;     
-      PBOpos[index].y = color.y;
-      PBOpos[index].z = color.z;
-  }
-}
-
-//TODO: Implement a vertex shader
-__global__ void vertexShadeKernel(float* vbo, int vbosize){
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if(index<vbosize/3){
-  }
-}
-
-//TODO: Implement primative assembly
-__global__ void primitiveAssemblyKernel(float* vbo, int vbosize, float* cbo, int cbosize, int* ibo, int ibosize, triangle* primitives){
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  int primitivesCount = ibosize/3;
-  if(index<primitivesCount){
-  }
-}
-
-//TODO: Implement a rasterization method, such as scanline.
-__global__ void rasterizationKernel(triangle* primitives, int primitivesCount, fragment* depthbuffer, glm::vec2 resolution){
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if(index<primitivesCount){
-  }
-}
-
-//TODO: Implement a fragment shader
-__global__ void fragmentShadeKernel(fragment* depthbuffer, glm::vec2 resolution){
-  int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-  int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-  int index = x + (y * resolution.x);
-  if(x<=resolution.x && y<=resolution.y){
-  }
-}
-
-//Writes fragment colors to the framebuffer
-__global__ void render(glm::vec2 resolution, fragment* depthbuffer, glm::vec3* framebuffer){
-
-  int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-  int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-  int index = x + (y * resolution.x);
-
-  if(x<=resolution.x && y<=resolution.y){
-    framebuffer[index] = depthbuffer[index].color;
-  }
-}
-
-// Wrapper for the __global__ call that sets up the kernel calls and does a ton of memory management
-void cudaRasterizeCore(uchar4* PBOpos, glm::vec2 resolution, float frame, float* vbo, int vbosize, float* cbo, int cbosize, int* ibo, int ibosize){
-
-  // set up crucial magic
-  int tileSize = 8;
-  dim3 threadsPerBlock(tileSize, tileSize);
-  dim3 fullBlocksPerGrid((int)ceil(float(resolution.x)/float(tileSize)), (int)ceil(float(resolution.y)/float(tileSize)));
-
-  //set up framebuffer
-  framebuffer = NULL;
-  cudaMalloc((void**)&framebuffer, (int)resolution.x*(int)resolution.y*sizeof(glm::vec3));
-  
-  //set up depthbuffer
-  depthbuffer = NULL;
-  cudaMalloc((void**)&depthbuffer, (int)resolution.x*(int)resolution.y*sizeof(fragment));
-
-  //kernel launches to black out accumulated/unaccumlated pixel buffers and clear our scattering states
-  clearImage<<<fullBlocksPerGrid, threadsPerBlock>>>(resolution, framebuffer, glm::vec3(0,0,0));
-  
-  fragment frag;
-  frag.color = glm::vec3(0,0,0);
-  frag.normal = glm::vec3(0,0,0);
-  frag.position = glm::vec3(0,0,-10000);
-  clearDepthBuffer<<<fullBlocksPerGrid, threadsPerBlock>>>(resolution, depthbuffer,frag);
-
-  //------------------------------
-  //memory stuff
-  //------------------------------
-  primitives = NULL;
-  cudaMalloc((void**)&primitives, (ibosize/3)*sizeof(triangle));
-
-  device_ibo = NULL;
-  cudaMalloc((void**)&device_ibo, ibosize*sizeof(int));
-  cudaMemcpy( device_ibo, ibo, ibosize*sizeof(int), cudaMemcpyHostToDevice);
-
-  device_vbo = NULL;
-  cudaMalloc((void**)&device_vbo, vbosize*sizeof(float));
-  cudaMemcpy( device_vbo, vbo, vbosize*sizeof(float), cudaMemcpyHostToDevice);
-
-  device_cbo = NULL;
-  cudaMalloc((void**)&device_cbo, cbosize*sizeof(float));
-  cudaMemcpy( device_cbo, cbo, cbosize*sizeof(float), cudaMemcpyHostToDevice);
-
-  tileSize = 32;
-  int primitiveBlocks = ceil(((float)vbosize/3)/((float)tileSize));
-
-  //------------------------------
-  //vertex shader
-  //------------------------------
-  vertexShadeKernel<<<primitiveBlocks, tileSize>>>(device_vbo, vbosize);
-
-  cudaDeviceSynchronize();
-  //------------------------------
-  //primitive assembly
-  //------------------------------
-  primitiveBlocks = ceil(((float)ibosize/3)/((float)tileSize));
-  primitiveAssemblyKernel<<<primitiveBlocks, tileSize>>>(device_vbo, vbosize, device_cbo, cbosize, device_ibo, ibosize, primitives);
-
-  cudaDeviceSynchronize();
-  //------------------------------
-  //rasterization
-  //------------------------------
-  rasterizationKernel<<<primitiveBlocks, tileSize>>>(primitives, ibosize/3, depthbuffer, resolution);
-
-  cudaDeviceSynchronize();
-  //------------------------------
-  //fragment shader
-  //------------------------------
-  fragmentShadeKernel<<<fullBlocksPerGrid, threadsPerBlock>>>(depthbuffer, resolution);
-
-  cudaDeviceSynchronize();
-  //------------------------------
-  //write fragments to framebuffer
-  //------------------------------
-  render<<<fullBlocksPerGrid, threadsPerBlock>>>(resolution, depthbuffer, framebuffer);
-  sendImageToPBO<<<fullBlocksPerGrid, threadsPerBlock>>>(PBOpos, resolution, framebuffer);
-
-  cudaDeviceSynchronize();
-
-  kernelCleanup();
-
-  checkCUDAError("Kernel failed!");
-}
-
-void kernelCleanup(){
-  cudaFree( primitives );
-  cudaFree( device_vbo );
-  cudaFree( device_cbo );
-  cudaFree( device_ibo );
-  cudaFree( framebuffer );
-  cudaFree( depthbuffer );
-}
-
+// CIS565 CUDA Rasterizer: A simple rasterization pipeline for Patrick Cozzi's CIS565: GPU Computing at the University of Pennsylvania
+// Written by Yining Karl Li, Copyright (c) 2012 University of Pennsylvania
+#include <thrust/random.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <cmath>
+#include <cutil_math.h>
+#include "rasterizeKernels.h"
+#include "rasterizeTools.h"
+#include "glm\gtc\/matrix_transform.hpp"
+
+glm::vec3* framebuffer;
+fragment* depthbuffer;
+int* device_stencil;
+float* device_vbo;
+float* device_cbo;
+int* device_ibo;
+float* device_nbo;
+triangle* primitives;
+
+void checkCUDAError(const char *msg) {
+  //Exit with a diagnostic if the CUDA runtime reports a pending error; msg labels the failing step.
+  const cudaError_t status = cudaGetLastError();
+  if( status == cudaSuccess ) return;
+  fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
+  exit(EXIT_FAILURE);
+}
+
+//Integer bit-mixing hash; the result is meant to be used as a seed for random number generation.
+__host__ __device__ unsigned int hash(unsigned int a){
+    a += 0x7ed55d16 + (a<<12);
+    a ^= 0xc761c23c ^ (a>>19);
+    a += 0x165667b1 + (a<<5);
+    a  = (a<<9) ^ (a+0xd3a2646c);
+    a += 0xfd7046c5 + (a<<3);
+    a ^= 0xb55a4f09 ^ (a>>16);
+    return a;
+}
+
+//Writes a given fragment to a fragment buffer at pixel (x, y); coordinates outside the buffer are ignored
+__host__ __device__ void writeToDepthbuffer(int x, int y, fragment frag, fragment* depthbuffer, glm::vec2 resolution){
+  if(x>=0 && y>=0 && x<resolution.x && y<resolution.y){  //negative coordinates must be rejected too
+    int index = (y*resolution.x) + x;
+    depthbuffer[index] = frag;
+  }
+}
+
+//Reads the fragment stored for pixel (x, y); returns a default-initialized fragment when (x, y) is outside the buffer
+__host__ __device__ fragment getFromDepthbuffer(int x, int y, fragment* depthbuffer, glm::vec2 resolution){
+  if(x>=0 && y>=0 && x<resolution.x && y<resolution.y){  //negative coordinates must be rejected too
+    int index = (y*resolution.x) + x;
+    return depthbuffer[index];
+  }else{
+    fragment f;
+    return f;
+  }
+}
+
+//Writes a given pixel to a pixel buffer at pixel (x, y); coordinates outside the buffer are ignored
+__host__ __device__ void writeToFramebuffer(int x, int y, glm::vec3 value, glm::vec3* framebuffer, glm::vec2 resolution){
+  if(x>=0 && y>=0 && x<resolution.x && y<resolution.y){  //negative coordinates must be rejected too
+    int index = (y*resolution.x) + x;
+    framebuffer[index] = value;
+  }
+}
+
+//Reads the pixel at (x, y) from a pixel buffer; returns black (0,0,0) when (x, y) is outside the buffer
+__host__ __device__ glm::vec3 getFromFramebuffer(int x, int y, glm::vec3* framebuffer, glm::vec2 resolution){
+  if(x>=0 && y>=0 && x<resolution.x && y<resolution.y){  //negative coordinates must be rejected too
+    int index = (y*resolution.x) + x;
+    return framebuffer[index];
+  }else{
+    return glm::vec3(0,0,0);
+  }
+}
+
+//Kernel that clears a given pixel buffer with a given color; launched on a 2D grid, one thread per pixel
+__global__ void clearImage(glm::vec2 resolution, glm::vec3* image, glm::vec3 color){
+    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+    int index = x + (y * resolution.x);
+    if(x<resolution.x && y<resolution.y){  //strict '<': x==width aliases the next row, y==height is past the end
+      image[index] = color;
+    }
+}
+
+//Kernel that clears a given fragment buffer with a given fragment; position.x/y of each entry become its pixel coordinates
+__global__ void clearDepthBuffer(glm::vec2 resolution, fragment* buffer, fragment frag){
+    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+    int index = x + (y * resolution.x);
+    if(x<resolution.x && y<resolution.y){  //strict '<': x==width aliases the next row, y==height is past the end
+      fragment f = frag;
+      f.position.x = x;
+      f.position.y = y;
+      buffer[index] = f;
+    }
+}
+
+//Kernel that writes the image to the OpenGL PBO directly.
+//Each color is scaled to 0..255, capped at 255, and stored mirrored in both x and y. One thread per pixel.
+__global__ void sendImageToPBO(uchar4* PBOpos, glm::vec2 resolution, glm::vec3* image){
+  int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+  int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+  int index = x + (y * resolution.x);
+
+  //Strict '<': threads with x==resolution.x or y==resolution.y lie outside the image.
+  if(x<resolution.x && y<resolution.y){
+      glm::vec3 color;
+      color.x = image[index].x*255.0;
+      color.y = image[index].y*255.0;
+      color.z = image[index].z*255.0;
+
+      if(color.x>255){
+        color.x = 255;
+      }
+
+      if(color.y>255){
+        color.y = 255;
+      }
+
+      if(color.z>255){
+        color.z = 255;
+      }
+
+      //Mirror both axes. The last valid column/row is resolution-1, so mirror around that;
+      //mirroring around resolution itself sent pixel (0,0) past the end of the PBO.
+      y=(int)resolution.y-1-y;
+      x=(int)resolution.x-1-x;
+      index=x+(y*resolution.x);
+
+      PBOpos[index].w = 0;
+      PBOpos[index].x = color.x;
+      PBOpos[index].y = color.y;
+      PBOpos[index].z = color.z;
+  }
+}
+
+//Vertex shader: transforms every vertex position in the VBO by the given matrix, in place.
+//
+//  vbo     - packed positions, three floats per vertex; overwritten with the result
+//  vbosize - number of floats in vbo
+//  project - matrix applied to (x, y, z, 1) through multiplyMV
+//
+//One thread per vertex; threads past the last vertex do nothing.
+//
+//NOTE(review): only the three components returned by multiplyMV are stored and no divide
+//by w is visible here. rasterizationKernel maps x and y from [-1, 1] to pixels, so confirm
+//that multiplyMV (or the matrix) already accounts for the perspective divide.
+__global__ void vertexShadeKernel(float* vbo, int vbosize , cudaMat4 project){
+  const int vertex = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if(vertex >= vbosize/3){
+    return;
+  }
+
+  //The three floats that belong to this vertex.
+  float* xyz = vbo + 3*vertex;
+
+  //w = 1: the input is a point, not a direction.
+  const glm::vec4 point(xyz[0], xyz[1], xyz[2], 1);
+  const glm::vec3 moved = multiplyMV(project, point);
+
+  //Store the transformed position over the original one.
+  xyz[0] = moved.x;
+  xyz[1] = moved.y;
+  xyz[2] = moved.z;
+}
+
+//Primitive assembly: one thread builds one triangle from the vertex, color and normal buffers.
+//Triangle i takes floats 9*i .. 9*i+8 of vbo as its positions and the same range of nbo as its normals.
+//Every triangle gets the same three colors, read from the first nine floats of cbo.
+//NOTE(review): ibo is never read (only ibosize, for the triangle count) - presumably the buffers are already in index order; confirm.
+__global__ void primitiveAssemblyKernel(float* vbo, int vbosize, float* cbo, int cbosize, int* ibo, int ibosize, triangle* primitives, float* nbo, int nbosize){
+  const int tri = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if(tri >= ibosize/3) return;
+  const float* v = vbo + 9*tri;
+  const float* n = nbo + 9*tri;
+  triangle& out = primitives[tri];
+
+  out.p0 = glm::vec3(v[0], v[1], v[2]);
+  out.p1 = glm::vec3(v[3], v[4], v[5]);
+  out.p2 = glm::vec3(v[6], v[7], v[8]);
+
+  out.c0 = glm::vec3(cbo[0], cbo[1], cbo[2]);
+  out.c1 = glm::vec3(cbo[3], cbo[4], cbo[5]);
+  out.c2 = glm::vec3(cbo[6], cbo[7], cbo[8]);
+
+  out.n0 = glm::vec3(n[0], n[1], n[2]);
+  out.n1 = glm::vec3(n[3], n[4], n[5]);
+  out.n2 = glm::vec3(n[6], n[7], n[8]);
+}
+//Scanline rasterizer: one thread per triangle.
+//
+//Parameters
+//  primitives      - assembled triangles; x and y of p0..p2 are treated as lying in [-1, 1]
+//                    on entry and are overwritten with pixel coordinates
+//  primitivesCount - number of triangles in primitives
+//  depthbuffer     - one fragment per pixel, indexed as x + y * resolution.x
+//  resolution      - framebuffer width (x) and height (y) in pixels
+//
+//What each thread does
+//  1. Viewport transform: x and y of the three vertices are mapped from [-1, 1] to pixel
+//     coordinates and written back to primitives[index]; z is left as it is.
+//  2. Row range: the rows between the lowest and the highest vertex are visited from top
+//     to bottom, clipped to the screen.
+//  3. Span: on every row the three edges are intersected with the row, which yields the
+//     leftmost and rightmost x covered by the triangle.
+//  4. Fragments: for every pixel of the span, color, normal and z are interpolated with
+//     barycentric weights and the fragment is stored if it passes the depth test.
+//
+//Depth convention: a fragment replaces the stored one when its z is larger.
+//cudaRasterizeCore clears the buffer to z = -10000.
+//
+//NOTE(review): fragments are written without any synchronization. Two triangles that
+//cover the same pixel are handled by different threads, so the depth test and the store
+//of one thread can interleave with those of another. A per-pixel lock or an atomic depth
+//compare is needed to make the result deterministic.
+
+//Widens [spanLo, spanHi] by the x at which the edge a-b crosses the pixel row y.
+//An edge that does not reach the row leaves the span unchanged.
+__device__ void addEdgeCrossing(glm::vec3 a, glm::vec3 b, float y, float& spanLo, float& spanHi)
+{
+  //A horizontal edge never has to contribute: if it lies exactly on the row, the two
+  //other edges cross the row at its end points. Skipping it also avoids dividing by 0.
+  if(a.y == b.y)
+  {
+    return;
+  }
+
+  //Test the y extent of the edge rather than its x extent, so that a vertical edge
+  //needs no special case and never contributes on rows it does not reach.
+  if(y < min(a.y, b.y) || y > max(a.y, b.y))
+  {
+    return;
+  }
+
+  //Linear interpolation along the edge; for a vertical edge this is simply a.x.
+  float x = a.x + (y - a.y) * (b.x - a.x) / (b.y - a.y);
+  spanLo = min(spanLo, x);
+  spanHi = max(spanHi, x);
+}
+
+__global__ void rasterizationKernel(triangle* primitives, int primitivesCount, fragment* depthbuffer, glm::vec2 resolution)
+{
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if(index >= primitivesCount)
+  {
+    return;
+  }
+
+  //------------------------------
+  //1. viewport transform
+  //------------------------------
+  primitives[index].p0.x=(primitives[index].p0.x+1) *resolution.x/2;
+  primitives[index].p0.y=(primitives[index].p0.y+1) *resolution.y/2;
+
+  primitives[index].p1.x=(primitives[index].p1.x+1) *resolution.x/2;
+  primitives[index].p1.y=(primitives[index].p1.y+1) *resolution.y/2;
+
+  primitives[index].p2.x=(primitives[index].p2.x+1) *resolution.x/2;
+  primitives[index].p2.y=(primitives[index].p2.y+1) *resolution.y/2;
+
+  //Local copy: the triangle is only read from here on.
+  triangle tri = primitives[index];
+
+  //------------------------------
+  //2. row range
+  //------------------------------
+  //Both ends are clamped independently (a triangle can cross the top and the bottom of
+  //the screen at once). The clamping is done in float, before the conversion to int, so
+  //that vertices far outside the screen cannot overflow the conversion.
+  float yLowest  = max(min(min(tri.p0.y, tri.p1.y), tri.p2.y), 0.0f);
+  float yHighest = min(max(max(tri.p0.y, tri.p1.y), tri.p2.y), resolution.y - 1);
+
+  //Only rows whose integer y lies inside [yLowest, yHighest] can intersect the triangle.
+  int bottom = (int)ceilf(yLowest);
+  int top    = (int)floorf(yHighest);
+
+  //The loop ends on an ordering test: a triangle completely above or below the screen
+  //ends up with top < bottom and the body never runs. Row 0 is included.
+  for(int currentpoint = top; currentpoint >= bottom; currentpoint--)
+  {
+    //------------------------------
+    //3. span of this row
+    //------------------------------
+    float spanLo =  10000000;
+    float spanHi = -10000000;
+    addEdgeCrossing(tri.p0, tri.p1, currentpoint, spanLo, spanHi);
+    addEdgeCrossing(tri.p1, tri.p2, currentpoint, spanLo, spanHi);
+    addEdgeCrossing(tri.p2, tri.p0, currentpoint, spanLo, spanHi);
+
+    //No edge crosses this row, or the span lies completely left or right of the screen.
+    if(spanLo > spanHi || spanHi < 0 || spanLo > resolution.x - 1)
+    {
+      continue;
+    }
+
+    //Clip the span to the screen rather than dropping crossings that fall outside it.
+    //The conversion to int truncates.
+    int xmin = (int)max(spanLo, 0.0f);
+    int xmax = (int)min(spanHi, resolution.x - 1);
+
+    //------------------------------
+    //4. fragments of this row
+    //------------------------------
+    for(int x = xmin; x <= xmax; x++)
+    {
+      int pixel_index = x + currentpoint * (int)resolution.x;
+
+      //The weights are evaluated for every pixel; evaluating them once per row would
+      //give all pixels of the row the same color and normal.
+      glm::vec3 barry = calculateBarycentricCoordinate(tri, glm::vec2(x, currentpoint));
+
+      fragment newfrag;
+      newfrag.color = barry.x*tri.c0 + barry.y*tri.c1 + barry.z*tri.c2;
+      newfrag.normal = glm::normalize(barry.x*tri.n0 + barry.y*tri.n1 + barry.z*tri.n2);
+      newfrag.lock = 1;  //nothing in this kernel reads it
+      newfrag.position.x = x;
+      newfrag.position.y = currentpoint;
+      newfrag.position.z = barry.x*tri.p0.z + barry.y*tri.p1.z + barry.z*tri.p2.z;
+
+      //Depth test against the fragment currently stored for this pixel (addressed by
+      //pixel_index, not by the triangle number). A single attempt per pixel: there is
+      //no lock to retry on, so a retry loop could only spin.
+      if(depthbuffer[pixel_index].position.z < newfrag.position.z)
+      {
+        depthbuffer[pixel_index] = newfrag;
+      }
+    }
+  }
+}
+
+//Fragment shader: diffuse (N dot L) lighting for every pixel whose stencil value is 1.
+//  depthbuffer    - one fragment per pixel; its normal, position and color are read
+//  lightpos       - light position; fragment.position is subtracted from it as is
+//  lightcol       - light color, multiplied component-wise with the fragment color
+//  device_stencil - one int per pixel; only pixels holding 1 are shaded
+//NOTE(review): the shaded color is computed but not stored - the write-back at the end is
+//still disabled. Enable it once the stencil contents are well defined.
+__global__ void fragmentShadeKernel(fragment* depthbuffer, glm::vec2 resolution, glm::vec3 lightpos, glm::vec3 lightcol, int* device_stencil)
+{
+  int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+  int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+  int index = x + (y * resolution.x);
+
+  //Strict '<' keeps out-of-image threads from reading past the buffers; the stencil is only read when in range.
+  if(x<resolution.x && y<resolution.y && device_stencil[index]==1)
+  {
+    glm::vec3 normal = glm::normalize(depthbuffer[index].normal);
+    glm::vec3 L = lightpos-depthbuffer[index].position;
+    //The dot product must be the clamped value itself; wrapped as "(dot, 0.0)" it was a comma expression that always gave 0.
+    float diffuse = glm::clamp(glm::dot(normal, glm::normalize(L)), 0.0f, 1.0f);
+    glm::vec3 final_col = diffuse*lightcol * depthbuffer[index].color;
+
+    //depthbuffer[index].color = final_col;
+  }
+}
+
+//Writes fragment colors to the framebuffer; launched on a 2D grid, one thread per pixel
+__global__ void render(glm::vec2 resolution, fragment* depthbuffer, glm::vec3* framebuffer){
+
+  int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+  int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+  int index = x + (y * resolution.x);
+
+  if(x<resolution.x && y<resolution.y){  //strict '<': x==width aliases the next row, y==height is past the end
+    framebuffer[index] = depthbuffer[index].color;
+  }
+}
+
+// Wrapper for the __global__ call that sets up the kernel calls and does a ton of memory management
+//
+// Renders one frame: allocates the device buffers, uploads the mesh, launches the pipeline
+// stages in order (vertex shader, primitive assembly, rasterization, fragment shader,
+// framebuffer write, copy to the PBO), releases the device buffers through kernelCleanup()
+// and finally calls checkCUDAError(), which exits the process if a CUDA error is pending.
+//
+//   PBOpos             - output pixels, written by sendImageToPBO
+//   resolution         - framebuffer width (x) and height (y) in pixels
+//   frame              - not used by this function
+//   vbo, vbosize       - host vertex positions and the number of floats in them
+//   cbo, cbosize       - host colors and the number of floats in them
+//   ibo, ibosize       - host indices and their count; ibosize/3 triangles are drawn
+//   nbosize, nbo       - number of floats in the host normals, and the normals
+//   lightpos, lightcol - light position and color, forwarded to fragmentShadeKernel
+//
+// NOTE(review): every device buffer is allocated and freed again on each call.
+void cudaRasterizeCore(uchar4* PBOpos, glm::vec2 resolution, float frame, float* vbo, int vbosize, float* cbo, int cbosize, int* ibo, int ibosize, int nbosize, float* nbo, glm::vec3 lightpos, glm::vec3 lightcol){
+
+  int totalpixels = (int)resolution.x*(int)resolution.y;
+
+  //set up the stencil buffer
+  //A single allocation that is cleared on the device. The buffer used to be allocated
+  //twice (the first block was leaked) and filled from an uninitialized host array that
+  //was never deleted, with a copy sized by sizeof(bool) instead of sizeof(int).
+  //NOTE(review): nothing writes 1 into the stencil yet, so fragmentShadeKernel skips
+  //every pixel until a pass that marks pixels is added.
+  device_stencil = NULL;
+  cudaMalloc((void**)&device_stencil, totalpixels*sizeof(int));
+  cudaMemset(device_stencil, 0, totalpixels*sizeof(int));
+
+  // set up crucial magic
+  int tileSize = 8;
+
+  dim3 threadsPerBlock(tileSize, tileSize);
+  dim3 fullBlocksPerGrid((int)ceil(float(resolution.x)/float(tileSize)), (int)ceil(float(resolution.y)/float(tileSize)));
+
+  //set up framebuffer
+  framebuffer = NULL;
+  cudaMalloc((void**)&framebuffer, totalpixels*sizeof(glm::vec3));
+
+  //set up depthbuffer
+  depthbuffer = NULL;
+  cudaMalloc((void**)&depthbuffer, totalpixels*sizeof(fragment));
+
+  //kernel launches to clear the framebuffer to black and reset the depth buffer
+  clearImage<<<fullBlocksPerGrid, threadsPerBlock>>>(resolution, framebuffer, glm::vec3(0,0,0));
+
+  //Every member is assigned because clearDepthBuffer copies the whole struct into each pixel.
+  fragment frag;
+  frag.color = glm::vec3(0,0,0);
+  frag.normal = glm::vec3(0,0,0);
+  frag.position = glm::vec3(0,0,-10000);
+  frag.lock = 0;
+  clearDepthBuffer<<<fullBlocksPerGrid, threadsPerBlock>>>(resolution, depthbuffer,frag);
+
+  //------------------------------
+  //memory stuff
+  //------------------------------
+  //triangle storage, filled by primitiveAssemblyKernel
+  primitives = NULL;
+  cudaMalloc((void**)&primitives, (ibosize/3)*sizeof(triangle));
+
+  device_ibo = NULL;
+  cudaMalloc((void**)&device_ibo, ibosize*sizeof(int));
+  cudaMemcpy( device_ibo, ibo, ibosize*sizeof(int), cudaMemcpyHostToDevice);
+
+  device_nbo =NULL;
+  cudaMalloc ((void**)&device_nbo, nbosize*sizeof(float));
+  cudaMemcpy(device_nbo, nbo, nbosize*sizeof(float),cudaMemcpyHostToDevice);
+
+  device_vbo = NULL;
+  cudaMalloc((void**)&device_vbo, vbosize*sizeof(float));
+  cudaMemcpy( device_vbo, vbo, vbosize*sizeof(float), cudaMemcpyHostToDevice);
+
+  device_cbo = NULL;
+  cudaMalloc((void**)&device_cbo, cbosize*sizeof(float));
+  cudaMemcpy( device_cbo, cbo, cbosize*sizeof(float), cudaMemcpyHostToDevice);
+
+  tileSize = 32;
+  int primitiveBlocks = ceil(((float)vbosize/3)/((float)tileSize));
+
+  //------------------------------
+  //vertex shader
+  //------------------------------
+
+  //setting up camera first
+  //The eye sits at (0, 0.5, 7) and looks at (0, 0.5, 0) with +y up; near plane 0.1, far plane 50.
+  //NOTE(review): whether 60.0f is read as degrees or radians depends on the GLM version in use - confirm.
+  glm::vec3 CameraPosition = glm::vec3(0.0f, 0.5f, 7.0f);
+  int width = resolution.x;
+  int height = resolution.y;
+
+  glm::mat4 projection = glm::perspective(60.0f, static_cast<float>(width) / static_cast<float>(height), 0.1f, 50.0f);
+
+  glm::mat4 camera = glm::lookAt(CameraPosition, glm::vec3(0.0, 0.5, 0), glm::vec3(0, 1, 0));
+
+  projection = projection * camera;
+
+  //Only the forward matrix is needed; the unused inverse is no longer computed.
+  cudaMat4 cudaProjection = utilityCore::glmMat4ToCudaMat4(projection);
+
+  vertexShadeKernel<<<primitiveBlocks, tileSize>>>(device_vbo, vbosize, cudaProjection);
+
+  cudaDeviceSynchronize();
+  //------------------------------
+  //primitive assembly
+  //------------------------------
+  primitiveBlocks = ceil(((float)ibosize/3)/((float)tileSize));
+  primitiveAssemblyKernel<<<primitiveBlocks, tileSize>>>(device_vbo, vbosize, device_cbo, cbosize, device_ibo, ibosize, primitives,device_nbo,nbosize);
+
+  cudaDeviceSynchronize();
+  //------------------------------
+  //rasterization
+  //------------------------------
+  rasterizationKernel<<<primitiveBlocks, tileSize>>>(primitives, ibosize/3, depthbuffer, resolution);
+
+  cudaDeviceSynchronize();
+  //------------------------------
+  //fragment shader
+  //------------------------------
+  fragmentShadeKernel<<<fullBlocksPerGrid, threadsPerBlock>>>(depthbuffer, resolution, lightpos, lightcol, device_stencil);
+
+  cudaDeviceSynchronize();
+  //------------------------------
+  //write fragments to framebuffer
+  //------------------------------
+  render<<<fullBlocksPerGrid, threadsPerBlock>>>(resolution, depthbuffer, framebuffer);
+  sendImageToPBO<<<fullBlocksPerGrid, threadsPerBlock>>>(PBOpos, resolution, framebuffer);
+
+  cudaDeviceSynchronize();
+
+  //Frees every device buffer allocated above.
+  kernelCleanup();
+
+  //Exits the process if a CUDA error is still pending.
+  checkCUDAError("Kernel failed!");
+}
+
+void kernelCleanup(){
+  //Releases the per-frame device buffers held in the file-level pointers.
+  //The pointers themselves are left unchanged; the free order is the listed order.
+  void* const buffers[] = { primitives, device_vbo, device_cbo, device_ibo,
+                            device_nbo, framebuffer, depthbuffer, device_stencil };
+  const int count = sizeof(buffers)/sizeof(buffers[0]);
+  for(int i=0; i<count; i++){
+    cudaFree( buffers[i] );
+  }
+}
+
diff --git a/src/rasterizeKernels.h b/src/rasterizeKernels.h
index bef3653..355e2f1 100755
--- a/src/rasterizeKernels.h
+++ b/src/rasterizeKernels.h
@@ -1,17 +1,17 @@
-// CIS565 CUDA Rasterizer: A simple rasterization pipeline for Patrick Cozzi's CIS565: GPU Computing at the University of Pennsylvania
-// Written by Yining Karl Li, Copyright (c) 2012 University of Pennsylvania
-
-#ifndef RASTERIZEKERNEL_H
-#define RASTERIZEKERNEL_H
-
-#include <stdio.h>
-#include <thrust/random.h>
-#include <cuda.h>
-#include <cmath>
-#include <cutil_math.h>
-#include "glm/glm.hpp"
-
-void kernelCleanup();
-void cudaRasterizeCore(uchar4* pos, glm::vec2 resolution, float frame, float* vbo, int vbosize, float* cbo, int cbosize, int* ibo, int ibosize);
-
-#endif //RASTERIZEKERNEL_H
+// CIS565 CUDA Rasterizer: A simple rasterization pipeline for Patrick Cozzi's CIS565: GPU Computing at the University of Pennsylvania
+// Written by Yining Karl Li, Copyright (c) 2012 University of Pennsylvania
+
+#ifndef RASTERIZEKERNEL_H
+#define RASTERIZEKERNEL_H
+
+#include <stdio.h>
+#include <thrust/random.h>
+#include <cuda.h>
+#include <cmath>
+#include <cutil_math.h>
+#include "glm/glm.hpp"
+
+void kernelCleanup();
+void cudaRasterizeCore(uchar4* pos, glm::vec2 resolution, float frame, float* vbo, int vbosize, float* cbo, int cbosize, int* ibo, int ibosize, int nbosize, float* nbo, glm::vec3 lightpos, glm::vec3 lightcol);
+
+#endif //RASTERIZEKERNEL_H
diff --git a/src/rasterizeTools.h b/src/rasterizeTools.h
index e9b5dcc..f2b864f 100755
--- a/src/rasterizeTools.h
+++ b/src/rasterizeTools.h
@@ -1,78 +1,83 @@
-// CIS565 CUDA Rasterizer: A simple rasterization pipeline for Patrick Cozzi's CIS565: GPU Computing at the University of Pennsylvania
-// Written by Yining Karl Li, Copyright (c) 2012 University of Pennsylvania
-
-#ifndef RASTERIZETOOLS_H
-#define RASTERIZETOOLS_H
-
-#include <cmath>
-#include "glm/glm.hpp"
-#include "utilities.h"
-#include "cudaMat4.h"
-
-struct triangle {
-  glm::vec3 p0;
-  glm::vec3 p1;
-  glm::vec3 p2;
-  glm::vec3 c0;
-  glm::vec3 c1;
-  glm::vec3 c2;
-};
-
-struct fragment{
-  glm::vec3 color;
-  glm::vec3 normal;
-  glm::vec3 position;
-};
-
-//Multiplies a cudaMat4 matrix and a vec4
-__host__ __device__ glm::vec3 multiplyMV(cudaMat4 m, glm::vec4 v){
-  glm::vec3 r(1,1,1);
-  r.x = (m.x.x*v.x)+(m.x.y*v.y)+(m.x.z*v.z)+(m.x.w*v.w);
-  r.y = (m.y.x*v.x)+(m.y.y*v.y)+(m.y.z*v.z)+(m.y.w*v.w);
-  r.z = (m.z.x*v.x)+(m.z.y*v.y)+(m.z.z*v.z)+(m.z.w*v.w);
-  return r;
-}
-
-//LOOK: finds the axis aligned bounding box for a given triangle
-__host__ __device__ void getAABBForTriangle(triangle tri, glm::vec3& minpoint, glm::vec3& maxpoint){
-  minpoint = glm::vec3(min(min(tri.p0.x, tri.p1.x),tri.p2.x), 
-        min(min(tri.p0.y, tri.p1.y),tri.p2.y),
-        min(min(tri.p0.z, tri.p1.z),tri.p2.z));
-  maxpoint = glm::vec3(max(max(tri.p0.x, tri.p1.x),tri.p2.x), 
-        max(max(tri.p0.y, tri.p1.y),tri.p2.y),
-        max(max(tri.p0.z, tri.p1.z),tri.p2.z));
-}
-
-//LOOK: calculates the signed area of a given triangle
-__host__ __device__ float calculateSignedArea(triangle tri){
-  return 0.5*((tri.p2.x - tri.p0.x)*(tri.p1.y - tri.p0.y) - (tri.p1.x - tri.p0.x)*(tri.p2.y - tri.p0.y));
-}
-
-//LOOK: helper function for calculating barycentric coordinates
-__host__ __device__ float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c, triangle tri){
-  triangle baryTri;
-  baryTri.p0 = glm::vec3(a,0); baryTri.p1 = glm::vec3(b,0); baryTri.p2 = glm::vec3(c,0);
-  return calculateSignedArea(baryTri)/calculateSignedArea(tri);
-}
-
-//LOOK: calculates barycentric coordinates
-__host__ __device__ glm::vec3 calculateBarycentricCoordinate(triangle tri, glm::vec2 point){
-  float beta  = calculateBarycentricCoordinateValue(glm::vec2(tri.p0.x,tri.p0.y), point, glm::vec2(tri.p2.x,tri.p2.y), tri);
-  float gamma = calculateBarycentricCoordinateValue(glm::vec2(tri.p0.x,tri.p0.y), glm::vec2(tri.p1.x,tri.p1.y), point, tri);
-  float alpha = 1.0-beta-gamma;
-  return glm::vec3(alpha,beta,gamma);
-}
-
-//LOOK: checks if a barycentric coordinate is within the boundaries of a triangle
-__host__ __device__ bool isBarycentricCoordInBounds(glm::vec3 barycentricCoord){
-   return barycentricCoord.x >= 0.0 && barycentricCoord.x <= 1.0 &&
-          barycentricCoord.y >= 0.0 && barycentricCoord.y <= 1.0 &&
-          barycentricCoord.z >= 0.0 && barycentricCoord.z <= 1.0;
-}
-
-//LOOK: for a given barycentric coordinate, return the corresponding z position on the triangle
-__host__ __device__ float getZAtCoordinate(glm::vec3 barycentricCoord, triangle tri){
-  return -(barycentricCoord.x*tri.p0.z + barycentricCoord.y*tri.p1.z + barycentricCoord.z*tri.p2.z);
-}
-
+// CIS565 CUDA Rasterizer: A simple rasterization pipeline for Patrick Cozzi's CIS565: GPU Computing at the University of Pennsylvania
+// Written by Yining Karl Li, Copyright (c) 2012 University of Pennsylvania
+
+#ifndef RASTERIZETOOLS_H
+#define RASTERIZETOOLS_H
+
+#include <cmath>
+#include "glm/glm.hpp"
+#include "utilities.h"
+#include "cudaMat4.h"
+
+struct triangle { // one triangle: three vertices, each with a p* position plus c* and n* attributes
+  glm::vec3 p0; // vertex positions; their x, y and z components are read by the helper functions in this header
+  glm::vec3 p1;
+  glm::vec3 p2;
+  glm::vec3 c0; // NOTE(review): presumably per-vertex colours - confirm against the code that fills them
+  glm::vec3 c1;
+  glm::vec3 c2;
+  glm::vec3 n1; // NOTE(review): presumably per-vertex normals - confirm; declaration order is n1, n2, n0 (n0 comes last)
+  glm::vec3 n2;
+  glm::vec3 n0;
+};
+
+struct fragment{ // NOTE(review): presumably one record per pixel of the depth buffer - confirm against the kernels
+  glm::vec3 color;
+  glm::vec3 normal;
+  glm::vec3 position;
+  int lock; // NOTE(review): presumably a per-fragment lock flag guarding concurrent writes - confirm against the kernels
+};
+
+//Multiplies a cudaMat4 matrix by a vec4 and returns the first three components of the product (no w is computed)
+__host__ __device__ glm::vec3 multiplyMV(cudaMat4 m, glm::vec4 v){
+  //every output component combines one of m.x, m.y, m.z with all four components of v
+  const float outX = (m.x.x*v.x)+(m.x.y*v.y)+(m.x.z*v.z)+(m.x.w*v.w);
+  const float outY = (m.y.x*v.x)+(m.y.y*v.y)+(m.y.z*v.z)+(m.y.w*v.w);
+  const float outZ = (m.z.x*v.x)+(m.z.y*v.y)+(m.z.z*v.z)+(m.z.w*v.w);
+  return glm::vec3(outX, outY, outZ);
+}
+
+
+//LOOK: computes the axis aligned bounding box of a triangle; the two corners are written to minpoint and maxpoint
+__host__ __device__ void getAABBForTriangle(triangle tri, glm::vec3& minpoint, glm::vec3& maxpoint){
+  minpoint.x = min(min(tri.p0.x, tri.p1.x), tri.p2.x);
+  minpoint.y = min(min(tri.p0.y, tri.p1.y), tri.p2.y);
+  minpoint.z = min(min(tri.p0.z, tri.p1.z), tri.p2.z);
+  maxpoint.x = max(max(tri.p0.x, tri.p1.x), tri.p2.x);
+  maxpoint.y = max(max(tri.p0.y, tri.p1.y), tri.p2.y);
+  maxpoint.z = max(max(tri.p0.z, tri.p1.z), tri.p2.z);
+}
+
+//LOOK: signed area of the triangle's projection onto the xy plane (z is ignored); the sign flips with the vertex winding
+__host__ __device__ float calculateSignedArea(triangle tri){
+  return 0.5f*((tri.p2.x - tri.p0.x)*(tri.p1.y - tri.p0.y) - (tri.p1.x - tri.p0.x)*(tri.p2.y - tri.p0.y)); // 0.5f (not 0.5) keeps the multiply in single precision
+}
+
+//LOOK: barycentric helper - ratio of the signed area of the 2D triangle (a,b,c) to the signed area of tri
+__host__ __device__ float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c, triangle tri){
+  triangle subTri; //only the positions are filled in; calculateSignedArea reads nothing else
+  subTri.p0 = glm::vec3(a.x,a.y,0.0f); subTri.p1 = glm::vec3(b.x,b.y,0.0f); subTri.p2 = glm::vec3(c.x,c.y,0.0f);
+  return calculateSignedArea(subTri)/calculateSignedArea(tri);
+}
+
+//LOOK: barycentric coordinates (alpha,beta,gamma) of point with respect to the xy positions of tri's p0, p1 and p2
+__host__ __device__ glm::vec3 calculateBarycentricCoordinate(triangle tri, glm::vec2 point){
+  const glm::vec2 v0(tri.p0.x,tri.p0.y), v1(tri.p1.x,tri.p1.y), v2(tri.p2.x,tri.p2.y);
+  const float beta  = calculateBarycentricCoordinateValue(v0, point, v2, tri);
+  const float gamma = calculateBarycentricCoordinateValue(v0, v1, point, tri);
+  return glm::vec3(static_cast<float>(1.0-beta-gamma), beta, gamma); //alpha is whatever remains so the three weights sum to one
+}
+
+//LOOK: true only when all three barycentric weights lie inside the closed interval [0,1]
+__host__ __device__ bool isBarycentricCoordInBounds(glm::vec3 barycentricCoord){
+   const bool xInside = 0.0 <= barycentricCoord.x && barycentricCoord.x <= 1.0;
+   const bool yInside = 0.0 <= barycentricCoord.y && barycentricCoord.y <= 1.0;
+   return xInside && yInside && 0.0 <= barycentricCoord.z && barycentricCoord.z <= 1.0;
+}
+
+//LOOK: weights the z of p0, p1 and p2 by the x, y and z components of barycentricCoord and returns the NEGATED sum
+__host__ __device__ float getZAtCoordinate(glm::vec3 barycentricCoord, triangle tri){
+  return -(barycentricCoord.x*tri.p0.z + barycentricCoord.y*tri.p1.z + barycentricCoord.z*tri.p2.z);
+}
+
 #endif
\ No newline at end of file
diff --git a/src/utilities.h b/src/utilities.h
index 3e6ef6e..2416a7f 100755
--- a/src/utilities.h
+++ b/src/utilities.h
@@ -1,44 +1,44 @@
-//UTILITYCORE- A Utility Library by Yining Karl Li
-//This file is part of UTILITYCORE, Coyright (c) 2012 Yining Karl Li
-
-#ifndef Pathtracer_utilities_h
-#define Pathtracer_utilities_h
-
-#include "glm/glm.hpp"
-#include <algorithm>
-#include <istream>
-#include <ostream>
-#include <iterator>
-#include <sstream>
-#include <string>
-#include <vector>
-#include "cudaMat4.h"
-
-const float PI                          =3.1415926535897932384626422832795028841971;
-const float TWO_PI                      =6.2831853071795864769252867665590057683943;
-const float SQRT_OF_ONE_THIRD           =0.5773502691896257645091487805019574556476;
-const float E                           =2.7182818284590452353602874713526624977572;
-const float EPSILON                     =.000000001;
-const float ZERO_ABSORPTION_EPSILON     =0.00001;
-const float RAY_BIAS_AMOUNT             =0.0002;
-
-namespace utilityCore {
-    extern float clamp(float f, float min, float max);
-    extern bool replaceString(std::string& str, const std::string& from, const std::string& to);
-    extern glm::vec3 clampRGB(glm::vec3 color);
-    extern bool epsilonCheck(float a, float b);
-    extern std::vector<std::string> tokenizeString(std::string str); 
-    extern cudaMat4 glmMat4ToCudaMat4(glm::mat4 a);
-    extern glm::mat4 cudaMat4ToGlmMat4(cudaMat4 a);
-    extern glm::mat4 buildTransformationMatrix(glm::vec3 translation, glm::vec3 rotation, glm::vec3 scale);
-    extern void printCudaMat4(cudaMat4 m);
-    extern std::string convertIntToString(int number);
-
-    //-----------------------------
-    //-------GLM Printers----------
-    //-----------------------------
-    extern void printMat4(glm::mat4);
-    extern void printVec4(glm::vec4);
-    extern void printVec3(glm::vec3);
-}
-#endif
+//UTILITYCORE- A Utility Library by Yining Karl Li
+//This file is part of UTILITYCORE, Copyright (c) 2012 Yining Karl Li
+
+#ifndef Pathtracer_utilities_h
+#define Pathtracer_utilities_h
+
+#include "glm/glm.hpp"
+#include <algorithm>
+#include <istream>
+#include <ostream>
+#include <iterator>
+#include <sstream>
+#include <string>
+#include <vector>
+#include "cudaMat4.h"
+
+const float PI                          =3.1415926535897932384626422832795028841971; // stored as float, so the digits beyond float precision are dropped
+const float TWO_PI                      =6.2831853071795864769252867665590057683943; // 2*PI
+const float SQRT_OF_ONE_THIRD           =0.5773502691896257645091487805019574556476; // sqrt(1/3)
+const float E                           =2.7182818284590452353602874713526624977572; // Euler's number
+const float EPSILON                     =.0001; // NOTE(review): presumably the tolerance used by utilityCore::epsilonCheck - confirm
+const float ZERO_ABSORPTION_EPSILON     =0.00001;
+const float RAY_BIAS_AMOUNT             =0.0002;
+
+namespace utilityCore { // declarations only; every function here is defined outside this header
+    extern float clamp(float f, float min, float max);
+    extern bool replaceString(std::string& str, const std::string& from, const std::string& to);
+    extern glm::vec3 clampRGB(glm::vec3 color);
+    extern bool epsilonCheck(float a, float b);
+    extern std::vector<std::string> tokenizeString(std::string str); 
+    extern cudaMat4 glmMat4ToCudaMat4(glm::mat4 a);
+    extern glm::mat4 cudaMat4ToGlmMat4(cudaMat4 a);
+    extern glm::mat4 buildTransformationMatrix(glm::vec3 translation, glm::vec3 rotation, glm::vec3 scale);
+    extern void printCudaMat4(cudaMat4 m);
+    extern std::string convertIntToString(int number);
+
+    //-----------------------------
+    //-------GLM Printers----------
+    //-----------------------------
+    extern void printMat4(glm::mat4);
+    extern void printVec4(glm::vec4);
+    extern void printVec3(glm::vec3);
+} // namespace utilityCore
+#endif