diff --git a/README.md b/README.md
index f044c821..a19bac08 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,55 @@
 CUDA Denoiser For CUDA Path Tracer
 ==================================
 
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3**
+
+* Edward Zhang
+  * https://www.linkedin.com/in/edwardjczhang/
+  * https://zedward23.github.io/personal_Website/
+
+* Tested on: Windows 10 Home, i7-11800H @ 2.3GHz, 16.0GB, NVIDIA GeForce RTX 3060 Laptop GPU
+
+## Denoiser Showcase
+
+Cornell Box - Denoised (10 Iterations)
+![](img/Denoised1.png)
+
+Cornell box with a large ceiling light, denoised with an A-Trous filter. Differences in color, normal, and position (the latter two stored in a G-buffer) weight each Gaussian-approximated neighbor's contribution to the denoised image output.
+
+Cornell Box - Without Denoising (10 Iterations)
+![](img/normal.png)
+
+Standard path-traced output of the same scene for the sake of comparison.
+
+## Outputs from Different Weight Combinations
+
+Only Normals
+
+![](img/normalWieghtOnly.png)
+
+Only Colors
+
+![](img/ColorWeightOnly.png)
+
+Only Positions
+
+![](img/PosWeightOnly.png)
+
+Individually, each component does not seem to guide the denoising process very strongly.
+
+## Runtime Analysis
+
+![](img/denoiseTimeChart.png)
+
+The runtime to denoise a frame with the A-Trous filter grows logarithmically with the filter size: the for loop doubles step_width on every pass, so the number of passes (and with it the total number of neighbors sampled per pixel) grows only logarithmically.
+
+![](img/FrameRenderChart.png)
+
+The denoising does not impact the overall path tracing operations at all; they are separate operations.
+
+### Note:
+When the lights of a scene are super small, the additional black on the screen from non-terminated rays will make the scene significantly noisier.
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
 
-### (TODO: Your README)
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
 
diff --git a/img/ColorWeightOnly.png b/img/ColorWeightOnly.png
new file mode 100644
index 00000000..68a1fef2
Binary files /dev/null and b/img/ColorWeightOnly.png differ
diff --git a/img/Denoised1.png b/img/Denoised1.png
new file mode 100644
index 00000000..8d46a09f
Binary files /dev/null and b/img/Denoised1.png differ
diff --git a/img/FrameRenderChart.png b/img/FrameRenderChart.png
new file mode 100644
index 00000000..2b901c55
Binary files /dev/null and b/img/FrameRenderChart.png differ
diff --git a/img/PosWeightOnly.png b/img/PosWeightOnly.png
new file mode 100644
index 00000000..ee5761e1
Binary files /dev/null and b/img/PosWeightOnly.png differ
diff --git a/img/denoiseTimeChart.png b/img/denoiseTimeChart.png
new file mode 100644
index 00000000..cc7e81fb
Binary files /dev/null and b/img/denoiseTimeChart.png differ
diff --git a/img/normal.png b/img/normal.png
new file mode 100644
index 00000000..f2b4597b
Binary files /dev/null and b/img/normal.png differ
diff --git a/img/normalWieghtOnly.png b/img/normalWieghtOnly.png
new file mode 100644
index 00000000..e0d8af25
Binary files /dev/null and b/img/normalWieghtOnly.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 4092ae4a..f26dfc8c 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -22,8 +22,8 @@ static double lastY;
 int ui_iterations = 0;
 int startupIterations = 0;
 int lastLoopIterations = 0;
-bool ui_showGbuffer = false;
-bool ui_denoise = false;
+bool ui_showGbuffer = true;
+bool ui_denoise = true;
 int ui_filterSize = 80;
 float ui_colorWeight = 0.45f;
 float ui_normalWeight = 0.35f;
@@ -166,7 +166,13 @@ void runCuda() {
     }
 
     if (ui_showGbuffer) {
-      showGBuffer(pbo_dptr);
+        // NOTE(review): the denoised image is only reachable while ui_showGbuffer is set
+        if (ui_denoise) {
+            denoise(ui_filterSize, ui_colorWeight, ui_normalWeight, ui_positionWeight, pbo_dptr, iteration);
+        }
+        else {
+            showGBuffer(pbo_dptr);
+        }
     } else {
       showImage(pbo_dptr, iteration);
     }
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index 23e5f909..12da78b0 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -13,9 +13,15 @@
 #include "pathtrace.h"
 #include "intersections.h"
 #include "interactions.h"
+#include "timer.h"
 
 #define ERRORCHECK 1
 
+PerformanceTimer& timer() {
+    static PerformanceTimer timer;
+    return timer;
+}
+
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
@@ -49,7 +55,7 @@ __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
         int iter, glm::vec3* image) {
     int x = (blockIdx.x * blockDim.x) + threadIdx.x;
     int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
+
     if (x < resolution.x && y < resolution.y) {
         int index = x + (y * resolution.x);
         glm::vec3 pix = image[index];
@@ -67,18 +73,24 @@ __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
     }
 }
 
-__global__ void gbufferToPBO(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) {
+__global__ void gbufferToPBO(uchar4* pbo,
+    glm::ivec2 resolution,
+    GBufferPixel* gBuffer) {
     int x = (blockIdx.x * blockDim.x) + threadIdx.x;
     int y = (blockIdx.y * blockDim.y) + threadIdx.y;
 
     if (x < resolution.x && y < resolution.y) {
         int index = x + (y * resolution.x);
         float timeToIntersect = gBuffer[index].t * 256.0;
+        // visualize the surface normal; a zero-length normal stays zero instead of becoming NaN
+        glm::vec3 norms = gBuffer[index].norm;
+        norms = glm::length(norms) > 0.f ? glm::normalize(norms) : glm::vec3(0.f);
 
         pbo[index].w = 0;
-        pbo[index].x = timeToIntersect;
-        pbo[index].y = timeToIntersect;
-        pbo[index].z = timeToIntersect;
+        // remap each component from [-1, 1] to [0, 255]; scaling by 256 overflows the 8-bit channel
+        pbo[index].x = (norms[0] + 1.f) / 2.f * 255.f;
+        pbo[index].y = (norms[1] + 1.f) / 2.f * 255.f;
+        pbo[index].z = (norms[2] + 1.f) / 2.f * 255.f;
     }
 }
 
@@ -91,6 +103,8 @@ static ShadeableIntersection * dev_intersections = NULL;
 static GBufferPixel* dev_gBuffer = NULL;
 // TODO: static variables for device memory, any extra info you need, etc
 // ...
+static glm::vec3* dev_denoised_img = NULL;
+static glm::vec3* dev_denoised_img_out = NULL;
 
 void pathtraceInit(Scene *scene) {
     hst_scene = scene;
@@ -114,6 +128,11 @@ void pathtraceInit(Scene *scene) {
     cudaMalloc(&dev_gBuffer, pixelcount * sizeof(GBufferPixel));
 
     // TODO: initialize any extra device memeory you need
+    // zero-initialized input/output buffers that the denoiser swaps between passes
+    cudaMalloc(&dev_denoised_img, pixelcount * sizeof(glm::vec3));
+    cudaMemset(dev_denoised_img, 0, pixelcount * sizeof(glm::vec3));
+    cudaMalloc(&dev_denoised_img_out, pixelcount * sizeof(glm::vec3));
+    cudaMemset(dev_denoised_img_out, 0, pixelcount * sizeof(glm::vec3));
 
     checkCUDAError("pathtraceInit");
 }
@@ -126,7 +145,8 @@ void pathtraceFree() {
     cudaFree(dev_intersections);
     cudaFree(dev_gBuffer);
     // TODO: clean up any extra device memory you created
-
+    cudaFree(dev_denoised_img);
+    cudaFree(dev_denoised_img_out);
     checkCUDAError("pathtraceFree");
 }
 
@@ -282,9 +302,54 @@ __global__ void generateGBuffer (
   if (idx < num_paths)
   {
     gBuffer[idx].t = shadeableIntersections[idx].t;
+    gBuffer[idx].pos = getPointOnRay(pathSegments[idx].ray, shadeableIntersections[idx].t);
+    gBuffer[idx].norm = shadeableIntersections[idx].surfaceNormal;
   }
 }
 
+// One pass of an edge-avoiding A-Trous filter over a 5x5 neighborhood whose taps are
+// step_width pixels apart. Each tap is weighted by the filter kernel and by how close its
+// color / normal / position is to the center pixel (falloff set by c_phi, n_phi, p_phi).
+// Expects a 2D launch covering `resolution`; `in` and `out` must be different buffers.
+__global__ void atrousFilter(const glm::ivec2 resolution, const glm::vec3* in, glm::vec3* out, const GBufferPixel* gBuffer,
+    float c_phi, float n_phi, float p_phi, float step_width) {
+    int idx_x = blockIdx.x * blockDim.x + threadIdx.x;
+    int idx_y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (idx_x >= resolution.x || idx_y >= resolution.y) {
+        return;
+    }
+    int idx = idx_x + idx_y * resolution.x;
+    glm::vec3 cval = in[idx];
+    glm::vec3 nval = gBuffer[idx].norm;
+    glm::vec3 pval = gBuffer[idx].pos;
+    // 1D kernel taps (1/16, 1/4, 3/8), looked up as kernel[2 - |offset|] so the center is largest
+    float kernel[3];
+    kernel[0] = .0625f;
+    kernel[1] = .25f;
+    kernel[2] = .375f;
+
+    glm::vec3 sum = glm::vec3(0, 0, 0);
+    float total_weight = 0.0f;
+
+    for (int dy = -2; dy <= 2; ++dy) {
+        for (int dx = -2; dx <= 2; ++dx) {
+            // clamp to the last valid pixel: coordinate `resolution` itself is one past the end
+            int u = glm::clamp(int(idx_x + dx * step_width), 0, resolution.x - 1);
+            int v = glm::clamp(int(idx_y + dy * step_width), 0, resolution.y - 1);
+            int uvIdx = u + v * resolution.x;
+            float c_w = min(exp(-glm::distance(cval, in[uvIdx]) / c_phi), 1.f);
+            float n_w = min(exp(-glm::distance(nval, gBuffer[uvIdx].norm) / n_phi), 1.0f);
+            float p_w = min(exp(-glm::distance(pval, gBuffer[uvIdx].pos) / p_phi), 1.0f);
+            float weight = c_w * n_w * p_w;
+            // the 2D kernel weight is the product of the horizontal and vertical 1D taps
+            float k = kernel[2 - abs(dx)] * kernel[2 - abs(dy)];
+            sum += in[uvIdx] * weight * k;
+            total_weight += weight * k;
+        }
+    }
+    out[idx] = sum / total_weight;
+}
+
 // Add the current iteration's output to the overall image
 __global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterationPaths)
 {
@@ -315,36 +380,7 @@ void pathtrace(int frame, int iter) {
     // 1D block for path tracing
     const int blockSize1d = 128;
 
-    ///////////////////////////////////////////////////////////////////////////
-
-    // Pathtracing Recap:
-    // * Initialize array of path rays (using rays that come out of the camera)
-    //   * You can pass the Camera object to that kernel.
-    //   * Each path ray must carry at minimum a (ray, color) pair,
-    //   * where color starts as the multiplicative identity, white = (1, 1, 1).
-    //   * This has already been done for you.
-    // * NEW: For the first depth, generate geometry buffers (gbuffers)
-    // * For each depth:
-    //   * Compute an intersection in the scene for each path ray.
-    //     A very naive version of this has been implemented for you, but feel
-    //     free to add more primitives and/or a better algorithm.
-    //     Currently, intersection distance is recorded as a parametric distance,
-    //     t, or a "distance along the ray." t = -1.0 indicates no intersection.
-    //     * Color is attenuated (multiplied) by reflections off of any object
-    //   * Stream compact away all of the terminated paths.
-    //     You may use either your implementation or `thrust::remove_if` or its
-    //     cousins.
-    //     * Note that you can't really use a 2D kernel launch any more - switch
-    //       to 1D.
-    //   * Shade the rays that intersected something or didn't bottom out.
-    //     That is, color the ray by performing a color computation according
-    //     to the shader, then generate a new ray to continue the ray path.
-    //     We recommend just updating the ray's PathSegment in place.
-    //     Note that this step may come before or after stream compaction,
-    //     since some shaders you write may also cause a path to terminate.
-    // * Finally:
-    //     * if not denoising, add this iteration's results to the image
-    //     * TODO: if denoising, run kernels that take both the raw pathtraced result and the gbuffer, and put the result in the "pbo" from opengl
+    timer().startGpuTimer();
 
     generateRayFromCamera <<<blocksPerGrid2d, blockSize2d >>>(cam, iter, traceDepth, dev_paths);
     checkCUDAError("generate camera ray");
@@ -407,18 +443,54 @@ void pathtrace(int frame, int iter) {
             pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
 
     checkCUDAError("pathtrace");
+
+    timer().endGpuTimer();
+    printElapsedTime(timer().getGpuElapsedTimeForPreviousOperation(), "(Frame Render Duration)");
+}
+
+// Denoises dev_image with repeated A-Trous passes, shows the result in `pbo` and copies
+// it into the host image. The tap spacing starts at 1 and doubles every pass while it is
+// <= filterSize. c_phi, n_phi and p_phi are the color, normal and position falloffs passed
+// to atrousFilter; iter is forwarded to sendImageToPBO.
+void denoise(float filterSize, float c_phi, float n_phi, float p_phi, uchar4* pbo, int iter) {
+    const Camera& cam = hst_scene->state.camera;
+    const dim3 blockSize2d(8, 8);
+    const dim3 blocksPerGrid2d(
+        (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+        (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+
+    int pixelCount = cam.resolution.x * cam.resolution.y;
+    timer().startGpuTimer();
+
+    // the first pass reads a copy of the path-traced image so dev_image itself is left untouched
+    cudaMemcpy(dev_denoised_img, dev_image, pixelCount * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
+
+    for (int step_width = 1; step_width <= filterSize; step_width *= 2) {
+        atrousFilter<<<blocksPerGrid2d, blockSize2d>>>(cam.resolution, dev_denoised_img, dev_denoised_img_out, dev_gBuffer, c_phi, n_phi, p_phi, step_width);
+
+        // this pass's output becomes the next pass's input
+        std::swap(dev_denoised_img, dev_denoised_img_out);
+    }
+
+    timer().endGpuTimer();
+    printElapsedTime(timer().getGpuElapsedTimeForPreviousOperation(), "(Denoise Duration)");
+
+    // because of the swap at the end of each pass, the final result is in dev_denoised_img
+    sendImageToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, iter, dev_denoised_img);
+    cudaMemcpy(hst_scene->state.image.data(), dev_denoised_img, pixelCount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
+    checkCUDAError("denoise");
 }
 
 // CHECKITOUT: this kernel "post-processes" the gbuffer/gbuffers into something that you can visualize for debugging.
 void showGBuffer(uchar4* pbo) {
-    const Camera &cam = hst_scene->state.camera;
+    const Camera& cam = hst_scene->state.camera;
     const dim3 blockSize2d(8, 8);
     const dim3 blocksPerGrid2d(
-            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
-            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+        (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+        (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
 
     // CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization
-    gbufferToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
+    gbufferToPBO <<<blocksPerGrid2d, blockSize2d>>> (pbo, cam.resolution, dev_gBuffer);
 }
 
 void showImage(uchar4* pbo, int iter) {
diff --git a/src/pathtrace.h b/src/pathtrace.h
index 9e12f440..9a33bec9 100644
--- a/src/pathtrace.h
+++ b/src/pathtrace.h
@@ -8,3 +8,5 @@ void pathtraceFree();
 void pathtrace(int frame, int iteration);
 void showGBuffer(uchar4 *pbo);
 void showImage(uchar4 *pbo, int iter);
+
+void denoise(float filterSize, float c_phi, float n_phi, float p_phi, uchar4* pbo, int iter);
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index da7e558a..f738d8cd 100644
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -79,4 +79,6 @@ struct ShadeableIntersection {
 // What information might be helpful for guiding a denoising filter?
struct GBufferPixel { float t; + glm::vec3 pos; + glm::vec3 norm; }; diff --git a/src/timer.h b/src/timer.h new file mode 100644 index 00000000..de9ad0a4 --- /dev/null +++ b/src/timer.h @@ -0,0 +1,110 @@ +#pragma once + +#include +#include +#include + +#include +#include +#include + +/** + * This class is used for timing the performance + * Uncopyable and unmovable + * + * Adapted from shineyruan(https://github.com/shineyruan) + */ +class PerformanceTimer { +public: + PerformanceTimer() { + cudaEventCreate(&event_start); + cudaEventCreate(&event_end); + } + + ~PerformanceTimer() { + cudaEventDestroy(event_start); + cudaEventDestroy(event_end); + } + + void startCpuTimer() { + if (cpu_timer_started) { + throw std::runtime_error("CPU timer already started"); + } + cpu_timer_started = true; + + time_start_cpu = std::chrono::high_resolution_clock::now(); + } + + void endCpuTimer() { + time_end_cpu = std::chrono::high_resolution_clock::now(); + + if (!cpu_timer_started) { + throw std::runtime_error("CPU timer not started"); + } + + std::chrono::duration duro = + time_end_cpu - time_start_cpu; + prev_elapsed_time_cpu_milliseconds = + static_cast(duro.count()); + + cpu_timer_started = false; + } + + void startGpuTimer() { + if (gpu_timer_started) { + throw std::runtime_error("GPU timer already started"); + } + gpu_timer_started = true; + + cudaEventRecord(event_start); + } + + void endGpuTimer() { + cudaEventRecord(event_end); + cudaEventSynchronize(event_end); + + if (!gpu_timer_started) { + throw std::runtime_error("GPU timer not started"); + } + + cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, + event_end); + gpu_timer_started = false; + } + + float + getCpuElapsedTimeForPreviousOperation() // noexcept //(damn I need VS 2015 + { + return prev_elapsed_time_cpu_milliseconds; + } + + float getGpuElapsedTimeForPreviousOperation() // noexcept + { + return prev_elapsed_time_gpu_milliseconds; + } + + // remove copy and move functions + 
PerformanceTimer(const PerformanceTimer&) = delete; + PerformanceTimer(PerformanceTimer&&) = delete; + PerformanceTimer& operator=(const PerformanceTimer&) = delete; + PerformanceTimer& operator=(PerformanceTimer&&) = delete; + +private: + cudaEvent_t event_start = nullptr; + cudaEvent_t event_end = nullptr; + + using time_point_t = std::chrono::high_resolution_clock::time_point; + time_point_t time_start_cpu; + time_point_t time_end_cpu; + + bool cpu_timer_started = false; + bool gpu_timer_started = false; + + float prev_elapsed_time_cpu_milliseconds = 0.f; + float prev_elapsed_time_gpu_milliseconds = 0.f; +}; + +template +void printElapsedTime(T time, std::string note = "") { + std::cout << " elapsed time: " << time << "ms " << note << std::endl; +} \ No newline at end of file