diff --git a/README.md b/README.md
index f044c821..a19bac08 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,55 @@
 CUDA Denoiser For CUDA Path Tracer
 ==================================
 
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
+
+* Edward Zhang
+  * https://www.linkedin.com/in/edwardjczhang/
+  * https://zedward23.github.io/personal_Website/
+ 
+* Tested on: Windows 10 Home, i7-11800H @ 2.3GHz, 16.0GB, NVIDIA GeForce RTX 3060 Laptop GPU
+
+## Denoiser Showcase
+
+Cornell Box - Denoised (10 Iterations)
+![](img/Denoised1.png)
+
+A Cornell box with a large ceiling light, denoised with an A-Trous filter. Differences in color, normal, and position (stored in a frame buffer) weight each Gaussian-approximated neighbor's contribution to the denoised output image.
+
+Cornell Box - Without Denoising (10 Iterations)
+![](img/normal.png)
+
+Standard path-traced output of the same scene for the sake of comparison.
+
+## Outputs from Different Weight Combinations
+
+Only Normals
+
+![](img/normalWieghtOnly.png)
+
+Only Colors
+
+![](img/ColorWeightOnly.png)
+
+Only Positions
+
+![](img/PosWeightOnly.png)
+
+Individually, each component does not seem to guide the denoising process very strongly.
+
+## Runtime Analysis
+
+![](img/denoiseTimeChart.png)
+
+Runtime to denoise a framebuffer with the A-Trous filter increases logarithmically with filter size: every pass samples the same 5x5 set of neighbors, and because the for loop doubles `step_width` each pass, the number of passes (and so the total number of neighbors sampled) grows only logarithmically.
+
+![](img/FrameRenderChart.png)
+
+The denoising does not impact the overall pathtrace operations at all; they are separate operations.
+
+### Note:
+When the lights in a scene are very small, the additional black on screen from non-terminated rays makes the scene significantly noisier.
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
 
-### (TODO: Your README)
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
 
diff --git a/img/ColorWeightOnly.png b/img/ColorWeightOnly.png
new file mode 100644
index 00000000..68a1fef2
Binary files /dev/null and b/img/ColorWeightOnly.png differ
diff --git a/img/Denoised1.png b/img/Denoised1.png
new file mode 100644
index 00000000..8d46a09f
Binary files /dev/null and b/img/Denoised1.png differ
diff --git a/img/FrameRenderChart.png b/img/FrameRenderChart.png
new file mode 100644
index 00000000..2b901c55
Binary files /dev/null and b/img/FrameRenderChart.png differ
diff --git a/img/PosWeightOnly.png b/img/PosWeightOnly.png
new file mode 100644
index 00000000..ee5761e1
Binary files /dev/null and b/img/PosWeightOnly.png differ
diff --git a/img/denoiseTimeChart.png b/img/denoiseTimeChart.png
new file mode 100644
index 00000000..cc7e81fb
Binary files /dev/null and b/img/denoiseTimeChart.png differ
diff --git a/img/normal.png b/img/normal.png
new file mode 100644
index 00000000..f2b4597b
Binary files /dev/null and b/img/normal.png differ
diff --git a/img/normalWieghtOnly.png b/img/normalWieghtOnly.png
new file mode 100644
index 00000000..e0d8af25
Binary files /dev/null and b/img/normalWieghtOnly.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 4092ae4a..f26dfc8c 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -22,8 +22,8 @@ static double lastY;
 int ui_iterations = 0;
 int startupIterations = 0;
 int lastLoopIterations = 0;
-bool ui_showGbuffer = false;
-bool ui_denoise = false;
+bool ui_showGbuffer = true;
+bool ui_denoise = true;
 int ui_filterSize = 80;
 float ui_colorWeight = 0.45f;
 float ui_normalWeight = 0.35f;
@@ -166,7 +166,13 @@ void runCuda() {
     }
 
     if (ui_showGbuffer) {
-      showGBuffer(pbo_dptr);
+        if (ui_denoise) {
+            denoise(ui_filterSize, ui_colorWeight, ui_normalWeight, ui_positionWeight, pbo_dptr, iteration);
+        }
+        else {
+            showGBuffer(pbo_dptr);
+        }
+        
     } else {
       showImage(pbo_dptr, iteration);
     }
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index 23e5f909..12da78b0 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -13,9 +13,15 @@
 #include "pathtrace.h"
 #include "intersections.h"
 #include "interactions.h"
+#include "timer.h"
 
 #define ERRORCHECK 1
 
+// Returns the one PerformanceTimer shared by pathtrace() and denoise(); built on first call.
+PerformanceTimer& timer() {
+    static PerformanceTimer instance;
+    return instance;
+}
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
@@ -49,7 +55,7 @@ __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
         int iter, glm::vec3* image) {
     int x = (blockIdx.x * blockDim.x) + threadIdx.x;
     int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
+    
     if (x < resolution.x && y < resolution.y) {
         int index = x + (y * resolution.x);
         glm::vec3 pix = image[index];
@@ -67,18 +73,24 @@ __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
     }
 }
 
-__global__ void gbufferToPBO(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) {
+__global__ void gbufferToPBO(uchar4* pbo,
+                             glm::ivec2 resolution,
+                             GBufferPixel* gBuffer) {
     int x = (blockIdx.x * blockDim.x) + threadIdx.x;
     int y = (blockIdx.y * blockDim.y) + threadIdx.y;
 
     if (x < resolution.x && y < resolution.y) {
         int index = x + (y * resolution.x);
         float timeToIntersect = gBuffer[index].t * 256.0;
+        // Debug view of the G-buffer normal: each component is mapped from [-1, 1] to [0, 255].
+        glm::vec3 norms = glm::normalize(gBuffer[index].norm);
 
         pbo[index].w = 0;
-        pbo[index].x = timeToIntersect;
-        pbo[index].y = timeToIntersect;
-        pbo[index].z = timeToIntersect;
+        // Scale by 255, not 256: a component of exactly 1 must still fit the 8-bit channel.
+        pbo[index].x = (norms[0] + 1.f) / 2.f * 255.f;
+        pbo[index].y = (norms[1] + 1.f) / 2.f * 255.f;
+        pbo[index].z = (norms[2] + 1.f) / 2.f * 255.f;
+
     }
 }
 
@@ -91,6 +103,8 @@ static ShadeableIntersection * dev_intersections = NULL;
 static GBufferPixel* dev_gBuffer = NULL;
 // TODO: static variables for device memory, any extra info you need, etc
 // ...
+static glm::vec3* dev_denoised_img = NULL;
+static glm::vec3* dev_denoised_img_out = NULL;
 
 void pathtraceInit(Scene *scene) {
     hst_scene = scene;
@@ -114,6 +128,11 @@ void pathtraceInit(Scene *scene) {
     cudaMalloc(&dev_gBuffer, pixelcount * sizeof(GBufferPixel));
 
     // TODO: initialize any extra device memeory you need
+    //Malloc and Memset for denoised img
+    cudaMalloc(&dev_denoised_img, pixelcount * sizeof(glm::vec3));
+    cudaMemset(dev_denoised_img, 0, pixelcount * sizeof(glm::vec3));
+    cudaMalloc(&dev_denoised_img_out, pixelcount * sizeof(glm::vec3));
+    cudaMemset(dev_denoised_img_out, 0, pixelcount * sizeof(glm::vec3));
 
     checkCUDAError("pathtraceInit");
 }
@@ -126,7 +145,8 @@ void pathtraceFree() {
   	cudaFree(dev_intersections);
     cudaFree(dev_gBuffer);
     // TODO: clean up any extra device memory you created
-
+    cudaFree(dev_denoised_img);
+    cudaFree(dev_denoised_img_out);
     checkCUDAError("pathtraceFree");
 }
 
@@ -282,9 +302,54 @@ __global__ void generateGBuffer (
   if (idx < num_paths)
   {
     gBuffer[idx].t = shadeableIntersections[idx].t;
+    gBuffer[idx].pos = getPointOnRay(pathSegments[idx].ray, shadeableIntersections[idx].t);
+    gBuffer[idx].norm = shadeableIntersections[idx].surfaceNormal;
+
   }
 }
 
+// One edge-avoiding A-Trous pass from `in` to `out`; each thread filters one pixel.
+// Taps form a 5x5 grid spaced step_width pixels apart, clamped to the image border.
+// c_phi / n_phi / p_phi scale the color / normal / position falloff of each tap.
+__global__ void atrousFilter(const glm::ivec2 resolution, const glm::vec3* in, glm::vec3* out, const GBufferPixel* gBuffer,
+    float c_phi, float n_phi, float p_phi, float step_width) {
+    int idx_x = blockIdx.x * blockDim.x + threadIdx.x;
+    int idx_y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (idx_x >= resolution.x || idx_y >= resolution.y) {
+        return;
+    }
+    int idx = idx_x + idx_y * resolution.x;
+    glm::vec3 cval = in[idx];
+    glm::vec3 nval = gBuffer[idx].norm;
+    glm::vec3 pval = gBuffer[idx].pos;
+
+    // Separable 5-tap kernel (1/16, 1/4, 3/8, 1/4, 1/16); a tap's 2D weight is the product.
+    const float kernel[5] = { .0625f, .25f, .375f, .25f, .0625f };
+
+    glm::vec3 sum = glm::vec3(0, 0, 0);
+    float total_weight = 0.0f;
+
+    for (int dy = -2; dy <= 2; ++dy) {
+        for (int dx = -2; dx <= 2; ++dx) {
+            // The last valid coordinate is resolution - 1; clamping to resolution reads out of bounds.
+            int u = glm::clamp(int(idx_x + dx * step_width), 0, resolution.x - 1);
+            int v = glm::clamp(int(idx_y + dy * step_width), 0, resolution.y - 1);
+            int uvIdx = u + v * resolution.x;
+            // Edge-stopping terms: 1 for an identical sample, decaying as the difference grows.
+            float c_w = min(exp(-glm::distance(cval, in[uvIdx]) / c_phi), 1.f);
+            float n_w = min(exp(-glm::distance(nval, gBuffer[uvIdx].norm) / n_phi), 1.0f);
+            float p_w = min(exp(-glm::distance(pval, gBuffer[uvIdx].pos) / p_phi), 1.0f);
+            float weight = c_w * n_w * p_w;
+
+            // The center tap gets the largest kernel weight and the corners the smallest.
+            float k = kernel[dx + 2] * kernel[dy + 2];
+            sum += in[uvIdx] * weight * k;
+            total_weight += weight * k;
+        }
+    }
+    out[idx] = sum / total_weight;
+}
+
 // Add the current iteration's output to the overall image
 __global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterationPaths)
 {
@@ -315,36 +380,7 @@ void pathtrace(int frame, int iter) {
 	// 1D block for path tracing
 	const int blockSize1d = 128;
 
-    ///////////////////////////////////////////////////////////////////////////
-
-    // Pathtracing Recap:
-    // * Initialize array of path rays (using rays that come out of the camera)
-    //   * You can pass the Camera object to that kernel.
-    //   * Each path ray must carry at minimum a (ray, color) pair,
-    //   * where color starts as the multiplicative identity, white = (1, 1, 1).
-    //   * This has already been done for you.
-    // * NEW: For the first depth, generate geometry buffers (gbuffers)
-    // * For each depth:
-    //   * Compute an intersection in the scene for each path ray.
-    //     A very naive version of this has been implemented for you, but feel
-    //     free to add more primitives and/or a better algorithm.
-    //     Currently, intersection distance is recorded as a parametric distance,
-    //     t, or a "distance along the ray." t = -1.0 indicates no intersection.
-    //     * Color is attenuated (multiplied) by reflections off of any object
-    //   * Stream compact away all of the terminated paths.
-    //     You may use either your implementation or `thrust::remove_if` or its
-    //     cousins.
-    //     * Note that you can't really use a 2D kernel launch any more - switch
-    //       to 1D.
-    //   * Shade the rays that intersected something or didn't bottom out.
-    //     That is, color the ray by performing a color computation according
-    //     to the shader, then generate a new ray to continue the ray path.
-    //     We recommend just updating the ray's PathSegment in place.
-    //     Note that this step may come before or after stream compaction,
-    //     since some shaders you write may also cause a path to terminate.
-    // * Finally:
-    //     * if not denoising, add this iteration's results to the image
-    //     * TODO: if denoising, run kernels that take both the raw pathtraced result and the gbuffer, and put the result in the "pbo" from opengl
+    timer().startGpuTimer();
 
 	generateRayFromCamera <<<blocksPerGrid2d, blockSize2d >>>(cam, iter, traceDepth, dev_paths);
 	checkCUDAError("generate camera ray");
@@ -407,18 +443,47 @@ void pathtrace(int frame, int iter) {
             pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
 
     checkCUDAError("pathtrace");
+
+    timer().endGpuTimer();
+    printElapsedTime(timer().getGpuElapsedTimeForPreviousOperation(), "(Frame Render Duration)");
+}
+
+
+// Runs A-Trous passes (tap spacing 1, 2, 4, ... up to filterSize) on the accumulated image and shows the result in pbo.
+void denoise(float filterSize, float c_phi, float n_phi, float p_phi, uchar4* pbo, int iter) {
+    const Camera& cam = hst_scene->state.camera;
+    const dim3 blockSize2d(8, 8);
+    const dim3 blocksPerGrid2d(
+        (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+        (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+    int pixelCount = cam.resolution.x * cam.resolution.y;
+    timer().startGpuTimer();
+    // Filter a copy so the accumulation buffer dev_image is left untouched.
+    cudaMemcpy(dev_denoised_img, dev_image, pixelCount * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
+    for (int step_width = 1; step_width <= filterSize; step_width *= 2) {
+        atrousFilter << < blocksPerGrid2d, blockSize2d >> > (cam.resolution, dev_denoised_img, dev_denoised_img_out, dev_gBuffer, c_phi, n_phi, p_phi, step_width);
+        // Ping-pong: after the swap dev_denoised_img holds the newest result.
+        std::swap(dev_denoised_img, dev_denoised_img_out);
+    }
+    timer().endGpuTimer();
+    printElapsedTime(timer().getGpuElapsedTimeForPreviousOperation(), "(Denoise Duration)");
+
+    sendImageToPBO << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, iter, dev_denoised_img);
+    // Copy back the buffer that was just displayed; dev_denoised_img_out is one pass behind.
+    cudaMemcpy(hst_scene->state.image.data(), dev_denoised_img, pixelCount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
+    checkCUDAError("denoise");
+}
 
 // CHECKITOUT: this kernel "post-processes" the gbuffer/gbuffers into something that you can visualize for debugging.
 void showGBuffer(uchar4* pbo) {
-    const Camera &cam = hst_scene->state.camera;
+    const Camera& cam = hst_scene->state.camera;
     const dim3 blockSize2d(8, 8);
     const dim3 blocksPerGrid2d(
-            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
-            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+        (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+        (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);  // rounded up so edge pixels get a block
 
     // CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization
-    gbufferToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
+    gbufferToPBO << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, dev_gBuffer);  // fills pbo from dev_gBuffer
 }
 
 void showImage(uchar4* pbo, int iter) {
diff --git a/src/pathtrace.h b/src/pathtrace.h
index 9e12f440..9a33bec9 100644
--- a/src/pathtrace.h
+++ b/src/pathtrace.h
@@ -8,3 +8,5 @@ void pathtraceFree();
 void pathtrace(int frame, int iteration);
 void showGBuffer(uchar4 *pbo);
 void showImage(uchar4 *pbo, int iter);
+
+void denoise(float size, float c, float n, float p, uchar4* pbo, int iter);
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index da7e558a..f738d8cd 100644
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -79,4 +79,6 @@ struct ShadeableIntersection {
 // What information might be helpful for guiding a denoising filter?
 struct GBufferPixel {
   float t;
+  glm::vec3 pos;   // point on the pixel's ray at distance t (getPointOnRay), written by generateGBuffer
+  glm::vec3 norm;  // surfaceNormal of the pixel's ShadeableIntersection, written by generateGBuffer
 };
diff --git a/src/timer.h b/src/timer.h
new file mode 100644
index 00000000..de9ad0a4
--- /dev/null
+++ b/src/timer.h
@@ -0,0 +1,110 @@
+#pragma once
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+
+#include <chrono>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+/**
+ * Stopwatch that measures one CPU interval (std::chrono) and one GPU
+ * interval (CUDA events) at a time, reporting each in milliseconds.
+ * Copying and moving are disabled.
+ *
+ * Adapted from shineyruan(https://github.com/shineyruan)
+ */
+class PerformanceTimer {
+public:
+    // Creates the two CUDA events that bracket a GPU measurement.
+    PerformanceTimer() {
+        cudaEventCreate(&gpu_begin);
+        cudaEventCreate(&gpu_finish);
+    }
+
+    // Destroys the CUDA events created by the constructor.
+    ~PerformanceTimer() {
+        cudaEventDestroy(gpu_begin);
+        cudaEventDestroy(gpu_finish);
+    }
+
+    PerformanceTimer(const PerformanceTimer&) = delete;
+    PerformanceTimer(PerformanceTimer&&) = delete;
+    PerformanceTimer& operator=(const PerformanceTimer&) = delete;
+    PerformanceTimer& operator=(PerformanceTimer&&) = delete;
+
+    // Begins a CPU measurement. Throws std::runtime_error if one is already running.
+    void startCpuTimer() {
+        if (cpu_running) {
+            throw std::runtime_error("CPU timer already started");
+        }
+        cpu_running = true;
+        cpu_begin = clock_type::now();
+    }
+
+    // Ends the CPU measurement and stores its length in milliseconds.
+    // Throws std::runtime_error if no measurement is running.
+    void endCpuTimer() {
+        // Sample the clock first so the check below is not part of the interval.
+        cpu_finish = clock_type::now();
+        if (!cpu_running) {
+            throw std::runtime_error("CPU timer not started");
+        }
+        const std::chrono::duration<double, std::milli> elapsed = cpu_finish - cpu_begin;
+        cpu_elapsed_ms = static_cast<float>(elapsed.count());
+        cpu_running = false;
+    }
+
+    // Begins a GPU measurement by recording the start event.
+    // Throws std::runtime_error if one is already running.
+    void startGpuTimer() {
+        if (gpu_running) {
+            throw std::runtime_error("GPU timer already started");
+        }
+        gpu_running = true;
+        cudaEventRecord(gpu_begin);
+    }
+
+    // Records the end event, waits for it, then stores the elapsed milliseconds.
+    // Throws std::runtime_error if no measurement is running.
+    void endGpuTimer() {
+        cudaEventRecord(gpu_finish);
+        cudaEventSynchronize(gpu_finish);
+        if (!gpu_running) {
+            throw std::runtime_error("GPU timer not started");
+        }
+        cudaEventElapsedTime(&gpu_elapsed_ms, gpu_begin, gpu_finish);
+        gpu_running = false;
+    }
+
+    // Length of the last completed CPU measurement in milliseconds (0 before any).
+    float getCpuElapsedTimeForPreviousOperation() {
+        return cpu_elapsed_ms;
+    }
+
+    // Length of the last completed GPU measurement in milliseconds (0 before any).
+    float getGpuElapsedTimeForPreviousOperation() {
+        return gpu_elapsed_ms;
+    }
+
+private:
+    using clock_type = std::chrono::high_resolution_clock;
+
+    cudaEvent_t gpu_begin = nullptr;
+    cudaEvent_t gpu_finish = nullptr;
+
+    clock_type::time_point cpu_begin;
+    clock_type::time_point cpu_finish;
+
+    bool cpu_running = false;
+    bool gpu_running = false;
+
+    float cpu_elapsed_ms = 0.f;
+    float gpu_elapsed_ms = 0.f;
+};
+// Prints `time` (labelled "ms") and an optional note to stdout; the note is taken by reference to avoid a copy.
+template <typename T>
+void printElapsedTime(T time, const std::string& note = "") {
+    std::cout << "   elapsed time: " << time << "ms    " << note << std::endl;
+}
\ No newline at end of file